In [14]:
# Required Libraries
import nltk
import string
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag

In [15]:
# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Test\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Test\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Test\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Test\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [16]:
# Sample Document
document = """Text analytics is a method to extract useful information from unstructured textual data. 
It includes preprocessing steps like tokenization, POS tagging, removing stop words, stemming, and lemmatization."""


In [17]:
# Step 1: Tokenization
tokens = word_tokenize(document)
print("Tokens:\n", tokens)

Tokens:
 ['Text', 'analytics', 'is', 'a', 'method', 'to', 'extract', 'useful', 'information', 'from', 'unstructured', 'textual', 'data', '.', 'It', 'includes', 'preprocessing', 'steps', 'like', 'tokenization', ',', 'POS', 'tagging', ',', 'removing', 'stop', 'words', ',', 'stemming', ',', 'and', 'lemmatization', '.']


In [31]:
# Step 2: POS Tagging
# Download updated tagger manually
import nltk
nltk.download('averaged_perceptron_tagger')  # Legacy fallback

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Test\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [32]:
# Then run POS tagging
from nltk import pos_tag
pos_tags = pos_tag(tokens)  # This should now work

In [33]:
# Step 3: Stop Words Removal
stop_words = set(stopwords.words('english'))
filtered_tokens = [word for word in tokens if word.lower() not in stop_words and word not in string.punctuation]
print("\nFiltered Tokens (without Stopwords and Punctuation):\n", filtered_tokens)



Filtered Tokens (without Stopwords and Punctuation):
 ['Text', 'analytics', 'method', 'extract', 'useful', 'information', 'unstructured', 'textual', 'data', 'includes', 'preprocessing', 'steps', 'like', 'tokenization', 'POS', 'tagging', 'removing', 'stop', 'words', 'stemming', 'lemmatization']


In [34]:
# Step 4: Stemming
stemmer = PorterStemmer()
stemmed_words = [stemmer.stem(word) for word in filtered_tokens]
print("\nStemmed Words:\n", stemmed_words)


Stemmed Words:
 ['text', 'analyt', 'method', 'extract', 'use', 'inform', 'unstructur', 'textual', 'data', 'includ', 'preprocess', 'step', 'like', 'token', 'po', 'tag', 'remov', 'stop', 'word', 'stem', 'lemmat']


In [27]:
# Step 5: Lemmatization
lemmatizer = WordNetLemmatizer()
lemmatized_words = [lemmatizer.lemmatize(word) for word in filtered_tokens]
print("\nLemmatized Words:\n", lemmatized_words)


Lemmatized Words:
 ['Text', 'analytics', 'method', 'extract', 'useful', 'information', 'unstructured', 'textual', 'data', 'includes', 'preprocessing', 'step', 'like', 'tokenization', 'POS', 'tagging', 'removing', 'stop', 'word', 'stemming', 'lemmatization']


In [35]:
# Step 6: TF and IDF Calculation using TfidfVectorizer
corpus = [document]
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)

In [36]:
# Create DataFrame for TF-IDF Scores
df_tfidf = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
print("\nTF-IDF Representation:\n")
print(df_tfidf)


TF-IDF Representation:

   analytics       and      data   extract      from  includes  information  \
0   0.196116  0.196116  0.196116  0.196116  0.196116  0.196116     0.196116   

         is        it  lemmatization  ...     steps      stop   tagging  \
0  0.196116  0.196116       0.196116  ...  0.196116  0.196116  0.196116   

       text   textual        to  tokenization  unstructured    useful  \
0  0.196116  0.196116  0.196116      0.196116      0.196116  0.196116   

      words  
0  0.196116  

[1 rows x 26 columns]
