In [1]:
import nltk

In [2]:
# Fetch every NLTK resource this notebook relies on.
# "punkt_tab" and "averaged_perceptron_tagger_eng" are the resource names that
# NLTK 3.9+ looks up for sent_tokenize/word_tokenize and pos_tag; the original
# "punkt" and "averaged_perceptron_tagger" names are kept for older releases.
NLTK_RESOURCES = (
    "punkt",
    "punkt_tab",
    "stopwords",
    "wordnet",
    "averaged_perceptron_tagger",
    "averaged_perceptron_tagger_eng",
)

# nltk.download returns True on success. The list forces every download to run
# before all() summarises them, so the cell still displays a single boolean.
all([nltk.download(resource) for resource in NLTK_RESOURCES])

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [3]:
# Sample three-sentence paragraph used as input to the tokenization cells.
text = (
    "Sachin was the GOAT of the previous generation. "
    "Virat is the GOAT of this generation. "
    "Shubman will be the GOAT of the next generation."
)

In [4]:
from nltk.tokenize import sent_tokenize

In [5]:
# Split the paragraph into sentences (English is sent_tokenize's default language).
tokenized_text = sent_tokenize(text, language="english")

In [6]:
# Show the list of sentences produced by sent_tokenize.
print(tokenized_text)

['Sachin was the GOAT of the previous generation.', 'Virat is the GOAT of this generation.', 'Shubman will be the GOAT of the next generation.']


In [7]:
from nltk.tokenize import word_tokenize


In [8]:
# Split the paragraph into word and punctuation tokens (English is the default language).
tokenized_word = word_tokenize(text, language="english")

In [9]:
# Show the word-level tokens; punctuation such as "." appears as its own token.
print(tokenized_word)

['Sachin', 'was', 'the', 'GOAT', 'of', 'the', 'previous', 'generation', '.', 'Virat', 'is', 'the', 'GOAT', 'of', 'this', 'generation', '.', 'Shubman', 'will', 'be', 'the', 'GOAT', 'of', 'the', 'next', 'generation', '.']


In [10]:
from nltk.corpus import stopwords

In [11]:
# Load NLTK's English stop-word list and keep it as a set for fast membership tests.
english_stop_words = stopwords.words("english")
stop_words = set(english_stop_words)


In [12]:
# Dump the full stop-word set (a set, so the printed order is arbitrary).
print(stop_words)

{'o', 'themselves', 'all', 'because', 'hadn', 'through', 'whom', "should've", 'having', 'the', 'mightn', 're', 'which', 'below', 'such', 'while', "isn't", 'again', 'own', 'will', "wasn't", 'herself', 'only', 'yourself', 'does', 'too', "you'd", 'about', 'before', 'not', 'further', 'yourselves', 'it', 'an', 'most', 'are', 'so', "won't", "hadn't", "shan't", 'been', 'ain', 'didn', "mustn't", 'couldn', 't', 'than', "mightn't", "shouldn't", 'those', 'y', 'd', 'aren', 'over', 'we', 'nor', 'with', 'during', 'her', "don't", 'shouldn', 'he', 'ourselves', 'itself', 'each', 'off', 'where', 'your', 'until', 'few', "weren't", 'm', 'am', 've', 'against', 'wouldn', 'she', 'won', "doesn't", 'up', 'here', 'hasn', 'no', 'hers', "that'll", "you've", 'they', 'i', 'more', "needn't", 'haven', 'but', 'has', 'now', 'weren', 'of', 'their', 'is', 'this', 'them', 'can', 'ma', 'to', "she's", 'were', 'above', 'was', 'what', 'into', "it's", 'some', 'don', 'you', 'and', 'other', 'have', 'him', 'down', "you'll", 'ours

In [13]:
# Drop English stop words from the word-level tokens of `text`.
# The stop-word entries are lowercase, so each token is lowercased for the
# membership test only (otherwise a capitalised stop word such as "The" would
# slip through); surviving tokens keep their original casing.
# Punctuation tokens such as "." are not stop words and are kept.
tokens = word_tokenize(text)
cleaned_tokens = [token for token in tokens if token.lower() not in stop_words]
print(cleaned_tokens)

['Sachin', 'GOAT', 'previous', 'generation', '.', 'Virat', 'GOAT', 'generation', '.', 'Shubman', 'GOAT', 'next', 'generation', '.']


In [14]:
from nltk.stem import PorterStemmer

In [15]:
from nltk.tokenize import sent_tokenize, word_tokenize

In [16]:
# Create the Porter stemmer instance used by the stemming cell.
ps = PorterStemmer()

In [17]:
# Reduce each stop-word-filtered token to its Porter stem.
stemmed_tokens = [ps.stem(word) for word in cleaned_tokens]
print(stemmed_tokens)


['sachin', 'goat', 'previou', 'gener', '.', 'virat', 'goat', 'gener', '.', 'shubman', 'goat', 'next', 'gener', '.']


In [18]:
from nltk.stem import WordNetLemmatizer

In [19]:
# Create the WordNet-based lemmatizer used by the lemmatization cell.
lemmatizer = WordNetLemmatizer()

In [20]:
# Lemmatize each stop-word-filtered token.
# No `pos` argument is passed, so the lemmatizer falls back to its default
# part of speech for every token.
lemmatized_tokens = [lemmatizer.lemmatize(word) for word in cleaned_tokens]
print(lemmatized_tokens)

['Sachin', 'GOAT', 'previous', 'generation', '.', 'Virat', 'GOAT', 'generation', '.', 'Shubman', 'GOAT', 'next', 'generation', '.']


In [21]:
from nltk import pos_tag

In [22]:
# Re-tokenize the full paragraph (stop words included) as input for POS tagging.
tokens = word_tokenize(text, language="english")

In [23]:
# Tag every token with its part of speech and show the (token, tag) pairs.
tagged_tokens = pos_tag(tokens)
print(tagged_tokens)

[('Sachin', 'NNP'), ('was', 'VBD'), ('the', 'DT'), ('GOAT', 'NNP'), ('of', 'IN'), ('the', 'DT'), ('previous', 'JJ'), ('generation', 'NN'), ('.', '.'), ('Virat', 'NNP'), ('is', 'VBZ'), ('the', 'DT'), ('GOAT', 'NNP'), ('of', 'IN'), ('this', 'DT'), ('generation', 'NN'), ('.', '.'), ('Shubman', 'NNP'), ('will', 'MD'), ('be', 'VB'), ('the', 'DT'), ('GOAT', 'NNP'), ('of', 'IN'), ('the', 'DT'), ('next', 'JJ'), ('generation', 'NN'), ('.', '.')]


In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [25]:
# Three-document corpus for the TF-IDF cells (one sentence per document).
# NOTE(review): this rebinds `text` from the earlier single string to a list of
# strings, so re-running the earlier tokenization cells after this one would
# pass them a list — consider a separate name such as `documents`.
# NOTE(review): the second document reads "of the this generation", whereas the
# earlier paragraph reads "of this generation" — looks like a typo; confirm.
# Fixing it would change the TF-IDF weights computed for that document.
text = [
    "Sachin was the GOAT of the previous generation",
    "Virat is the GOAT of the this generation",
    "Shubman will be the GOAT of the next generation"
]

In [26]:
# Create a TF-IDF vectorizer with scikit-learn's default settings.
vectorizer = TfidfVectorizer()

In [27]:
# Learn the vocabulary and IDF weights from the corpus.
# fit() returns the fitted vectorizer itself, so `matrix` refers to the same
# object as `vectorizer` rather than to a matrix of scores.
vectorizer.fit(text)
matrix = vectorizer
# Display the learned term -> column-index mapping.
matrix.vocabulary_

{'sachin': 7,
 'was': 12,
 'the': 9,
 'goat': 2,
 'of': 5,
 'previous': 6,
 'generation': 1,
 'virat': 11,
 'is': 3,
 'this': 10,
 'shubman': 8,
 'will': 13,
 'be': 0,
 'next': 4}

In [28]:
# Convert the corpus into TF-IDF weights using the vocabulary fitted above.
# Printing the result lists one "(document index, feature index)  weight" entry
# per non-zero term.
tfidf_matrix = vectorizer.transform(text)
print(tfidf_matrix)

  (0, 12)	0.4286758743128819
  (0, 9)	0.5063657539459899
  (0, 7)	0.4286758743128819
  (0, 6)	0.4286758743128819
  (0, 5)	0.25318287697299496
  (0, 2)	0.25318287697299496
  (0, 1)	0.25318287697299496
  (1, 11)	0.4286758743128819
  (1, 10)	0.4286758743128819
  (1, 9)	0.5063657539459899
  (1, 5)	0.25318287697299496
  (1, 3)	0.4286758743128819
  (1, 2)	0.25318287697299496
  (1, 1)	0.25318287697299496
  (2, 13)	0.39400039808922477
  (2, 9)	0.4654059642457353
  (2, 8)	0.39400039808922477
  (2, 5)	0.23270298212286766
  (2, 4)	0.39400039808922477
  (2, 2)	0.23270298212286766
  (2, 1)	0.23270298212286766
  (2, 0)	0.39400039808922477


In [29]:
# Show the vocabulary terms ordered by column index, to decode the feature
# indices printed with the TF-IDF matrix.
print(vectorizer.get_feature_names_out())

['be' 'generation' 'goat' 'is' 'next' 'of' 'previous' 'sachin' 'shubman'
 'the' 'this' 'virat' 'was' 'will']
