In [41]:
import nltk
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag, word_tokenize

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Vedant\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Vedant\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Vedant\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Vedant\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [46]:
document = "The quick brown fox jumps over the lazy dog. The dog barked loudly."

In [47]:
tokens = word_tokenize(document)
print("\nTokens:")
print(tokens)


Tokens:
['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog', '.', 'The', 'dog', 'barked', 'loudly', '.']


In [48]:
pos_tags = pos_tag(tokens)
print("\nPOS Tags:")
print(pos_tags)


POS Tags:
[('The', 'DT'), ('quick', 'JJ'), ('brown', 'NN'), ('fox', 'NN'), ('jumps', 'VBZ'), ('over', 'IN'), ('the', 'DT'), ('lazy', 'JJ'), ('dog', 'NN'), ('.', '.'), ('The', 'DT'), ('dog', 'NN'), ('barked', 'VBD'), ('loudly', 'RB'), ('.', '.')]


In [49]:
stop_words = set(stopwords.words('english'))
tokens_without_stopwords = [word for word in tokens if word.lower() not in stop_words and word.isalpha()]
print("\nAfter Stopwords Removal:")
print(tokens_without_stopwords)


After Stopwords Removal:
['quick', 'brown', 'fox', 'jumps', 'lazy', 'dog', 'dog', 'barked', 'loudly']


In [50]:
stemmer = PorterStemmer()
stemmed_words = [stemmer.stem(word) for word in tokens_without_stopwords]
print("\nAfter Stemming:")
print(stemmed_words)


After Stemming:
['quick', 'brown', 'fox', 'jump', 'lazi', 'dog', 'dog', 'bark', 'loudli']


In [51]:
lemmatizer = WordNetLemmatizer()
lemmatized_words = [lemmatizer.lemmatize(word.lower()) for word in tokens_without_stopwords]
print("\nAfter Lemmatization:")
print(lemmatized_words)


After Lemmatization:
['quick', 'brown', 'fox', 'jump', 'lazy', 'dog', 'dog', 'barked', 'loudly']


In [52]:
processed_document = ' '.join(lemmatized_words)

In [53]:
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform([processed_document])
features = vectorizer.get_feature_names_out()
tfidf_array = tfidf_matrix.toarray()

In [54]:
print("\nTF-IDF Feature Names:")
print(features)

print("\nTF-IDF Matrix:")
print(tfidf_array)


TF-IDF Feature Names:
['barked' 'brown' 'dog' 'fox' 'jump' 'lazy' 'loudly' 'quick']

TF-IDF Matrix:
[[0.30151134 0.30151134 0.60302269 0.30151134 0.30151134 0.30151134
  0.30151134 0.30151134]]
