In [1]:
import nltk
import os

nltk.data.path.append(os.path.abspath('./nltk_data'))

nltk.download('punkt_tab', download_dir='./nltk_data')
nltk.download('stopwords', download_dir='./nltk_data') 
nltk.download('averaged_perceptron_tagger_eng', download_dir='./nltk_data')

[nltk_data] Downloading package punkt_tab to ./nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.
[nltk_data] Downloading package stopwords to ./nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     ./nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger_eng.zip.


True

In [2]:
text = "It is a truth universally acknowledged, that a single man in possession of a good fortune,  must be in want of a wife." 
text = text.lower() 
print(text) 

it is a truth universally acknowledged, that a single man in possession of a good fortune,  must be in want of a wife.


In [3]:
import string 
print(string.punctuation)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [4]:
text_p = "".join([char for char in text if char not in string.punctuation])
print(text_p)

it is a truth universally acknowledged that a single man in possession of a good fortune  must be in want of a wife


In [5]:
from nltk import word_tokenize, sent_tokenize

# Tokenize
words = word_tokenize(text_p) 
sent = sent_tokenize(text_p) 
print(words) 
print(sent)

['it', 'is', 'a', 'truth', 'universally', 'acknowledged', 'that', 'a', 'single', 'man', 'in', 'possession', 'of', 'a', 'good', 'fortune', 'must', 'be', 'in', 'want', 'of', 'a', 'wife']
['it is a truth universally acknowledged that a single man in possession of a good fortune  must be in want of a wife']


In [6]:
from nltk.corpus import stopwords 
stop_words = stopwords.words('english') 
print(stop_words)

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

In [7]:
filtered_words = [word for word in words if word not in stop_words] 
print(filtered_words)

['truth', 'universally', 'acknowledged', 'single', 'man', 'possession', 'good', 'fortune', 'must', 'want', 'wife']


In [8]:
from nltk.stem.porter import PorterStemmer 
porter = PorterStemmer() 
stemmed = [porter.stem(word) for word in filtered_words] 
print(stemmed)

['truth', 'univers', 'acknowledg', 'singl', 'man', 'possess', 'good', 'fortun', 'must', 'want', 'wife']


In [9]:
from nltk import pos_tag 
pos = pos_tag(filtered_words) 
print(pos) 

[('truth', 'NN'), ('universally', 'RB'), ('acknowledged', 'VBD'), ('single', 'JJ'), ('man', 'NN'), ('possession', 'NN'), ('good', 'JJ'), ('fortune', 'NN'), ('must', 'MD'), ('want', 'VB'), ('wife', 'NN')]


In [10]:
# 5. Calculate TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer
corpus = [text]
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)

In [11]:
# Get feature names (terms)
terms = tfidf_vectorizer.get_feature_names_out()
print("TF-IDF Terms:", terms)

TF-IDF Terms: ['acknowledged' 'fortune' 'good' 'man' 'possession' 'single' 'truth'
 'universally' 'want' 'wife']


In [12]:
# Display TF-IDF values for the document
tfidf_values = tfidf_matrix.toarray()
print("TF-IDF Values:", tfidf_values)

TF-IDF Values: [[0.31622777 0.31622777 0.31622777 0.31622777 0.31622777 0.31622777
  0.31622777 0.31622777 0.31622777 0.31622777]]
