In [19]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag

In [20]:
sample_doc = "Text analytics is the process of analyzing unstructured text data to derive meaningful insights and patterns."


In [21]:
tokens = word_tokenize(sample_doc)


In [22]:
tokens

['Text',
 'analytics',
 'is',
 'the',
 'process',
 'of',
 'analyzing',
 'unstructured',
 'text',
 'data',
 'to',
 'derive',
 'meaningful',
 'insights',
 'and',
 'patterns',
 '.']

In [25]:
tags = pos_tag(tokens)

In [26]:
tags

[('Text', 'NN'),
 ('analytics', 'NNS'),
 ('is', 'VBZ'),
 ('the', 'DT'),
 ('process', 'NN'),
 ('of', 'IN'),
 ('analyzing', 'VBG'),
 ('unstructured', 'JJ'),
 ('text', 'NN'),
 ('data', 'NNS'),
 ('to', 'TO'),
 ('derive', 'VB'),
 ('meaningful', 'JJ'),
 ('insights', 'NNS'),
 ('and', 'CC'),
 ('patterns', 'NNS'),
 ('.', '.')]

In [30]:
stopwords_set = set(stopwords.words('english'))

In [35]:
filtered_tokens = [ word for word in tokens if word.lower() not in stopwords_set]

In [36]:
filtered_tokens

['Text',
 'analytics',
 'process',
 'analyzing',
 'unstructured',
 'text',
 'data',
 'derive',
 'meaningful',
 'insights',
 'patterns',
 '.']

In [38]:
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

In [39]:
stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]

In [40]:
stemmed_tokens

['text',
 'analyt',
 'process',
 'analyz',
 'unstructur',
 'text',
 'data',
 'deriv',
 'meaning',
 'insight',
 'pattern',
 '.']

In [44]:
lemmatized_tokens = [ lemmatizer.lemmatize(word) for word in filtered_tokens]

In [45]:
lemmatized_tokens

['Text',
 'analytics',
 'process',
 'analyzing',
 'unstructured',
 'text',
 'data',
 'derive',
 'meaningful',
 'insight',
 'pattern',
 '.']

In [46]:
processed_doc  = "".join(lemmatized_tokens)

In [48]:
processed_doc

'Textanalyticsprocessanalyzingunstructuredtextdataderivemeaningfulinsightpattern.'

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:

corpus = [
    "Sachin was the GOAT of the previous generation",
    "Virat is the GOAT of the this generation",
    "Shubman will be the GOAT of the next generation"
]

In [6]:
vectorizer  = TfidfVectorizer()
matrix = vectorizer.fit(corpus)


In [7]:
matrix.vocabulary_

{'sachin': 7,
 'was': 12,
 'the': 9,
 'goat': 2,
 'of': 5,
 'previous': 6,
 'generation': 1,
 'virat': 11,
 'is': 3,
 'this': 10,
 'shubman': 8,
 'will': 13,
 'be': 0,
 'next': 4}

In [8]:

tfidf_matrix = vectorizer.transform(corpus)
print(tfidf_matrix)

  (0, 12)	0.4286758743128819
  (0, 9)	0.5063657539459899
  (0, 7)	0.4286758743128819
  (0, 6)	0.4286758743128819
  (0, 5)	0.25318287697299496
  (0, 2)	0.25318287697299496
  (0, 1)	0.25318287697299496
  (1, 11)	0.4286758743128819
  (1, 10)	0.4286758743128819
  (1, 9)	0.5063657539459899
  (1, 5)	0.25318287697299496
  (1, 3)	0.4286758743128819
  (1, 2)	0.25318287697299496
  (1, 1)	0.25318287697299496
  (2, 13)	0.39400039808922477
  (2, 9)	0.4654059642457353
  (2, 8)	0.39400039808922477
  (2, 5)	0.23270298212286766
  (2, 4)	0.39400039808922477
  (2, 2)	0.23270298212286766
  (2, 1)	0.23270298212286766
  (2, 0)	0.39400039808922477


In [9]:

print(vectorizer.get_feature_names_out())

['be' 'generation' 'goat' 'is' 'next' 'of' 'previous' 'sachin' 'shubman'
 'the' 'this' 'virat' 'was' 'will']
