In [19]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag

In [20]:
sample_doc = "Text analytics is the process of analyzing unstructured text data to derive meaningful insights and patterns."


In [21]:
tokens = word_tokenize(sample_doc)


In [22]:
tokens

['Text',
 'analytics',
 'is',
 'the',
 'process',
 'of',
 'analyzing',
 'unstructured',
 'text',
 'data',
 'to',
 'derive',
 'meaningful',
 'insights',
 'and',
 'patterns',
 '.']

In [25]:
tags = pos_tag(tokens)

In [26]:
tags

[('Text', 'NN'),
 ('analytics', 'NNS'),
 ('is', 'VBZ'),
 ('the', 'DT'),
 ('process', 'NN'),
 ('of', 'IN'),
 ('analyzing', 'VBG'),
 ('unstructured', 'JJ'),
 ('text', 'NN'),
 ('data', 'NNS'),
 ('to', 'TO'),
 ('derive', 'VB'),
 ('meaningful', 'JJ'),
 ('insights', 'NNS'),
 ('and', 'CC'),
 ('patterns', 'NNS'),
 ('.', '.')]

In [30]:
stopwords_set = set(stopwords.words('english'))

In [35]:
filtered_tokens = [ word for word in tokens if word.lower() not in stopwords_set]

In [36]:
filtered_tokens

['Text',
 'analytics',
 'process',
 'analyzing',
 'unstructured',
 'text',
 'data',
 'derive',
 'meaningful',
 'insights',
 'patterns',
 '.']

In [38]:
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

In [39]:
stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]

In [40]:
stemmed_tokens

['text',
 'analyt',
 'process',
 'analyz',
 'unstructur',
 'text',
 'data',
 'deriv',
 'meaning',
 'insight',
 'pattern',
 '.']

In [44]:
lemmatized_tokens = [ lemmatizer.lemmatize(word) for word in filtered_tokens]

In [45]:
lemmatized_tokens

['Text',
 'analytics',
 'process',
 'analyzing',
 'unstructured',
 'text',
 'data',
 'derive',
 'meaningful',
 'insight',
 'pattern',
 '.']

In [46]:
processed_doc  = "".join(lemmatized_tokens)

In [48]:
processed_doc

'Textanalyticsprocessanalyzingunstructuredtextdataderivemeaningfulinsightpattern.'

In [50]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [53]:
TfIdf  = TfidfVectorizer()
tfidf_matrix = TfIdf.fit_transform([processed_doc])

In [54]:
tfidf_matrix

<1x1 sparse matrix of type '<class 'numpy.float64'>'
	with 1 stored elements in Compressed Sparse Row format>

In [55]:
feature_names = TfIdf.get_feature_names_out()

In [56]:
feature_names

array(['textanalyticsprocessanalyzingunstructuredtextdataderivemeaningfulinsightpattern'],
      dtype=object)

In [57]:
for col in tfidf_matrix.nonzero()[1]:
    print(f"{feature_names[col]}: {tfidf_matrix[0, col]}")

textanalyticsprocessanalyzingunstructuredtextdataderivemeaningfulinsightpattern: 1.0
