In [1]:
# --- Tokenization -----------------------------------------------------------
# Break the sample text into word and punctuation tokens.
import nltk
from nltk.tokenize import word_tokenize

# Make sure the 'punkt' resource is present before the tokenizer runs.
nltk.download('punkt')

document = "The quick brown fox jumped over the lazy dog. The dog slept over the verandah."
tokens = word_tokenize(document)

# Punctuation is kept as separate tokens (see the '.' entries in the output).
print(tokens)


['The', 'quick', 'brown', 'fox', 'jumped', 'over', 'the', 'lazy', 'dog', '.', 'The', 'dog', 'slept', 'over', 'the', 'verandah', '.']


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# --- Part-of-speech tagging -------------------------------------------------
from nltk import pos_tag

# Fetch the tagger model before calling pos_tag.
nltk.download('averaged_perceptron_tagger')

# Each entry of pos_tags is a (token, tag) pair, in the same order as `tokens`.
pos_tags = pos_tag(tokens)
print(pos_tags)


[('The', 'DT'), ('quick', 'JJ'), ('brown', 'NN'), ('fox', 'NN'), ('jumped', 'VBD'), ('over', 'IN'), ('the', 'DT'), ('lazy', 'JJ'), ('dog', 'NN'), ('.', '.'), ('The', 'DT'), ('dog', 'NN'), ('slept', 'VBD'), ('over', 'IN'), ('the', 'DT'), ('verandah', 'NN'), ('.', '.')]


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [3]:
# --- Stop-word removal ------------------------------------------------------
from nltk.corpus import stopwords

nltk.download('stopwords')

# Hold the English stop words in a set so each membership check is cheap.
stop_words = set(stopwords.words('english'))

# Compare in lowercase so capitalised forms such as 'The' are dropped too;
# the surviving tokens keep their original casing.
filtered_tokens = [
    tok
    for tok in tokens
    if tok.lower() not in stop_words
]

print(filtered_tokens)


['quick', 'brown', 'fox', 'jumped', 'lazy', 'dog', '.', 'dog', 'slept', 'verandah', '.']


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# --- Stemming and lemmatization ---------------------------------------------
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import wordnet
nltk.download('wordnet')
nltk.download('omw-1.4')

stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()


def _wordnet_pos(treebank_tag):
    """Translate a Penn Treebank tag into the matching WordNet POS constant.

    Tags starting with 'J' map to adjective, 'V' to verb and 'R' to adverb.
    Anything else (nouns, punctuation, an empty tag) falls back to noun,
    which is also the lemmatizer's own default part of speech.
    """
    first_letter_to_pos = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'R': wordnet.ADV,
    }
    return first_letter_to_pos.get(treebank_tag[:1], wordnet.NOUN)


stemmed_tokens = [stemmer.stem(token) for token in filtered_tokens]

# Lemmatize each token with its own part of speech rather than treating every
# token as a verb. The tags come from `pos_tags`, which was computed on the
# full sentence (stop words included) so the tagger saw the complete context;
# the same stop-word filter is then applied so the result lines up with
# `filtered_tokens`.
filtered_pos_tags = [
    (token, tag) for token, tag in pos_tags if token.lower() not in stop_words
]
lemmatized_tokens = [
    lemmatizer.lemmatize(token, _wordnet_pos(tag))
    for token, tag in filtered_pos_tags
]

print(stemmed_tokens)
print(lemmatized_tokens)


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


['quick', 'brown', 'fox', 'jump', 'lazi', 'dog', '.', 'dog', 'slept', 'verandah', '.']
['quick', 'brown', 'fox', 'jump', 'lazy', 'dog', '.', 'dog', 'sleep', 'verandah', '.']


In [5]:
# --- TF-IDF representation --------------------------------------------------
from sklearn.feature_extraction.text import TfidfVectorizer

document = "The quick brown fox jumped over the lazy dog. The dog slept over the verandah."

# The vectorizer takes a collection of documents, so the single string is
# wrapped in a one-element list.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform([document])

# Printing the sparse matrix lists (row, column) positions with their weights.
# NOTE: with only one document the IDF factor is the same for every term, so
# the weights are just the length-normalised term counts (the term occurring
# four times gets 4/sqrt(31) ~= 0.718).
print(tfidf_matrix)


  (0, 9)	0.1796053020267749
  (0, 7)	0.1796053020267749
  (0, 1)	0.3592106040535498
  (0, 4)	0.1796053020267749
  (0, 5)	0.3592106040535498
  (0, 3)	0.1796053020267749
  (0, 2)	0.1796053020267749
  (0, 0)	0.1796053020267749
  (0, 6)	0.1796053020267749
  (0, 8)	0.7184212081070996


In [6]:
# --- TF-IDF weights together with the term -> column mapping ----------------
from sklearn.feature_extraction.text import TfidfVectorizer

document = "The quick brown fox jumped over the lazy dog. The dog slept over the verandah."

# Fit on a one-document corpus and show the resulting sparse weights.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform([document])
print(tfidf_matrix)

# vocabulary_ is a dict mapping each term to its column index in the matrix;
# despite the variable name it is not a plain list of feature names.
feature_names = tfidf_vectorizer.vocabulary_

# List the terms alphabetically next to their column index so the
# (row, column) pairs printed above can be read back as words.
for word, column_index in sorted(feature_names.items()):
    print(word, column_index)


  (0, 9)	0.1796053020267749
  (0, 7)	0.1796053020267749
  (0, 1)	0.3592106040535498
  (0, 4)	0.1796053020267749
  (0, 5)	0.3592106040535498
  (0, 3)	0.1796053020267749
  (0, 2)	0.1796053020267749
  (0, 0)	0.1796053020267749
  (0, 6)	0.1796053020267749
  (0, 8)	0.7184212081070996
brown 0
dog 1
fox 2
jumped 3
lazy 4
over 5
quick 6
slept 7
the 8
verandah 9
