In [29]:
# importing dependencies.
import math
import string
from collections import Counter

import nltk
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize

In [30]:
# download NLTK datasets.
# Each call fetches one resource into the local nltk_data directory (a no-op
# when the package is already up to date, as the log below shows).
nltk.download('punkt')  # tokenizer models -- presumably what word_tokenize relies on
nltk.download('punkt_tab')  # NOTE(review): looks like the tabular variant of punkt needed by newer NLTK -- confirm
nltk.download('stopwords')  # corpus read later through stopwords.words('english')
nltk.download('averaged_perceptron_tagger_eng')  # English tagger model -- presumably the one pos_tag loads
nltk.download('wordnet')  # WordNet data for WordNetLemmatizer; this last call's return value is the cell output

[nltk_data] Downloading package punkt to /Users/parimal/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/parimal/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/parimal/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /Users/parimal/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data] Downloading package wordnet to /Users/parimal/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [31]:
# Input Text, lowercased in the same step so `text` is bound exactly once and
# every later cell sees a single casing.
text = "It is a truth universally acknowledged, that a single man in possession of a good fortune, must be in want of a wife.".lower()
text

'it is a truth universally acknowledged, that a single man in possession of a good fortune, must be in want of a wife.'

In [32]:
# Remove Punctuation
# Build a translation table that maps every character in string.punctuation
# to "delete", then apply it to the lowercased text in one pass.
_punct_table = str.maketrans("", "", string.punctuation)
text_p = text.translate(_punct_table)
print("Text without Punctuation:\n", text_p)

Text without Punctuation:
 it is a truth universally acknowledged that a single man in possession of a good fortune must be in want of a wife


In [33]:
# Tokenization: split the punctuation-free text into a list of word tokens.
# `words` still contains stopwords; it is the input to both the stopword
# filter and the term-frequency count further down.
words = word_tokenize(text_p)
print("Tokens:\n", words)

Tokens:
 ['it', 'is', 'a', 'truth', 'universally', 'acknowledged', 'that', 'a', 'single', 'man', 'in', 'possession', 'of', 'a', 'good', 'fortune', 'must', 'be', 'in', 'want', 'of', 'a', 'wife']


In [34]:
# Stopword removal.
# The English stopword list is turned into a set once so that each membership
# check in the filter below is a hash lookup.
stop_words = set(stopwords.words('english'))
filtered_words = [token for token in words if token not in stop_words]
print("Filtered Words (Stopwords Removed):\n", filtered_words)

Filtered Words (Stopwords Removed):
 ['truth', 'universally', 'acknowledged', 'single', 'man', 'possession', 'good', 'fortune', 'must', 'want', 'wife']


In [35]:
# Stemming.
# Apply the Porter stemmer to every stopword-filtered token, keeping order.
stemmer = PorterStemmer()
stemmed_words = list(map(stemmer.stem, filtered_words))
print("Stemmed Words:\n", stemmed_words)

Stemmed Words:
 ['truth', 'univers', 'acknowledg', 'singl', 'man', 'possess', 'good', 'fortun', 'must', 'want', 'wife']


In [36]:
# Lemmatizing.
lemmatizer = WordNetLemmatizer()
lemmatized_words = [lemmatizer.lemmatize(word) for word in filtered_words]
print("Lemmatized Words:\n", lemmatized_words)

Lemmatized Words:
 ['truth', 'universally', 'acknowledged', 'single', 'man', 'possession', 'good', 'fortune', 'must', 'want', 'wife']


In [37]:
# Parts of Speech Tagging.
# Tags the stopword-filtered tokens; the result is a list of (token, tag)
# pairs in the same order as `filtered_words`.
# NOTE(review): the tagger sees the sentence with stopwords already removed,
# which may affect the tags it assigns -- consider tagging `words` and
# filtering afterwards; confirm the intended behavior.
pos_tags = pos_tag(filtered_words)
print("POS Tags are:\n", pos_tags)

POS Tags are:
 [('truth', 'NN'), ('universally', 'RB'), ('acknowledged', 'VBD'), ('single', 'JJ'), ('man', 'NN'), ('possession', 'NN'), ('good', 'JJ'), ('fortune', 'NN'), ('must', 'MD'), ('want', 'VB'), ('wife', 'NN')]


In [42]:
# Term Frequency.
# tf(t) = (occurrences of t) / (total number of tokens), computed over the
# full token list `words`, i.e. with stopwords still included.
# `Counter` comes from the notebook's top import cell.
term_counts = Counter(words)
total_terms = len(words)
tf = {term: count / total_terms for term, count in term_counts.items()}
tf

{'it': 0.043478260869565216,
 'is': 0.043478260869565216,
 'a': 0.17391304347826086,
 'truth': 0.043478260869565216,
 'universally': 0.043478260869565216,
 'acknowledged': 0.043478260869565216,
 'that': 0.043478260869565216,
 'single': 0.043478260869565216,
 'man': 0.043478260869565216,
 'in': 0.08695652173913043,
 'possession': 0.043478260869565216,
 'of': 0.08695652173913043,
 'good': 0.043478260869565216,
 'fortune': 0.043478260869565216,
 'must': 0.043478260869565216,
 'be': 0.043478260869565216,
 'want': 0.043478260869565216,
 'wife': 0.043478260869565216}

In [43]:
# Inverse Document Frequency (smoothed).
# The corpus is the single sentence tokenized above, so N = 1 and every term
# occurs in exactly that one document (document frequency 1).
N = 1
doc_freq = dict.fromkeys(term_counts, 1)
# idf(t) = ln((1 + N) / (1 + df(t))) + 1
# The (1 + df) denominator is what lets rarer terms score higher; the "+ 1"
# outside the log keeps terms that occur in every document from dropping to 0.
# With one document this evaluates to 1.0 for every term.
idf = {term: math.log((1 + N) / (1 + df)) + 1 for term, df in doc_freq.items()}
idf

{'it': 1.0,
 'is': 1.0,
 'a': 1.0,
 'truth': 1.0,
 'universally': 1.0,
 'acknowledged': 1.0,
 'that': 1.0,
 'single': 1.0,
 'man': 1.0,
 'in': 1.0,
 'possession': 1.0,
 'of': 1.0,
 'good': 1.0,
 'fortune': 1.0,
 'must': 1.0,
 'be': 1.0,
 'want': 1.0,
 'wife': 1.0}