In [1]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag

In [2]:
# Download NLTK resources (if not already downloaded)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [3]:
# Define the sample sentence
sentence = "I am currently reading a book about the importance of learning, even though it's a bit challenging with all the distractions and numbers of pages."


Tokenization

In [4]:
# Tokenization
tokens = word_tokenize(sentence)
print("Tokenization:", tokens)

Tokenization: ['I', 'am', 'currently', 'reading', 'a', 'book', 'about', 'the', 'importance', 'of', 'learning', ',', 'even', 'though', 'it', "'s", 'a', 'bit', 'challenging', 'with', 'all', 'the', 'distractions', 'and', 'numbers', 'of', 'pages', '.']


POS Tagging

In [5]:
# POS Tagging
pos_tags = pos_tag(tokens)
print("POS Tagging:", pos_tags)

POS Tagging: [('I', 'PRP'), ('am', 'VBP'), ('currently', 'RB'), ('reading', 'VBG'), ('a', 'DT'), ('book', 'NN'), ('about', 'IN'), ('the', 'DT'), ('importance', 'NN'), ('of', 'IN'), ('learning', 'NN'), (',', ','), ('even', 'RB'), ('though', 'IN'), ('it', 'PRP'), ("'s", 'VBZ'), ('a', 'DT'), ('bit', 'NN'), ('challenging', 'VBG'), ('with', 'IN'), ('all', 'PDT'), ('the', 'DT'), ('distractions', 'NNS'), ('and', 'CC'), ('numbers', 'NNS'), ('of', 'IN'), ('pages', 'NNS'), ('.', '.')]


stop words removal

In [6]:
# Stop Words Removal
stop_words = set(stopwords.words('english'))
filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
print("Filtered Tokens (after stop words removal):", filtered_tokens)

Filtered Tokens (after stop words removal): ['currently', 'reading', 'book', 'importance', 'learning', ',', 'even', 'though', "'s", 'bit', 'challenging', 'distractions', 'numbers', 'pages', '.']


Stemming 

In [7]:
# Stemming
stemmer = PorterStemmer()
stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]
print("Stemmed Tokens:", stemmed_tokens)

Stemmed Tokens: ['current', 'read', 'book', 'import', 'learn', ',', 'even', 'though', "'s", 'bit', 'challeng', 'distract', 'number', 'page', '.']


Lemmatization

In [8]:
# Lemmatization
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]
print("Lemmatized Tokens:", lemmatized_tokens)

Lemmatized Tokens: ['currently', 'reading', 'book', 'importance', 'learning', ',', 'even', 'though', "'s", 'bit', 'challenging', 'distraction', 'number', 'page', '.']


2 -> Create representation of documents by calculating Term Frequency and Inverse DocumentFrequency.

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [10]:
# Define the documents
documents = [
    "I am currently reading a book about the importance of learning, even though it's a bit challenging with all the distractions and numbers of pages.",
    "The quick brown fox jumped over the lazy dog.",
    "Dogs are loyal companions.",
    "The cat sat on the mat."
]

In [11]:
# Create a TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the documents using TF-IDF Vectorizer
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)

# Get the feature names (terms)
feature_names = tfidf_vectorizer.get_feature_names_out()

In [12]:
# Print the TF-IDF representation of documents
for doc_index, doc in enumerate(documents):
    print(f"TF-IDF representation for Document {doc_index + 1}:")
    for term_index, term in enumerate(feature_names):
        tfidf_value = tfidf_matrix[doc_index, term_index]
        if tfidf_value > 0:
            print(f"{term}: {tfidf_value:.4f}")
    print()

TF-IDF representation for Document 1:
about: 0.2057
all: 0.2057
am: 0.2057
and: 0.2057
bit: 0.2057
book: 0.2057
challenging: 0.2057
currently: 0.2057
distractions: 0.2057
even: 0.2057
importance: 0.2057
it: 0.2057
learning: 0.2057
numbers: 0.2057
of: 0.4114
pages: 0.2057
reading: 0.2057
the: 0.2626
though: 0.2057
with: 0.2057

TF-IDF representation for Document 2:
brown: 0.3404
dog: 0.3404
fox: 0.3404
jumped: 0.3404
lazy: 0.3404
over: 0.3404
quick: 0.3404
the: 0.4346

TF-IDF representation for Document 3:
are: 0.5000
companions: 0.5000
dogs: 0.5000
loyal: 0.5000

TF-IDF representation for Document 4:
cat: 0.4215
mat: 0.4215
on: 0.4215
sat: 0.4215
the: 0.5380



In [13]:
# optional to above code 

tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)
print(tfidf_matrix.toarray())

[[0.20571759 0.20571759 0.20571759 0.20571759 0.         0.20571759
  0.20571759 0.         0.         0.20571759 0.         0.20571759
  0.20571759 0.         0.         0.20571759 0.         0.20571759
  0.20571759 0.         0.         0.20571759 0.         0.
  0.20571759 0.41143519 0.         0.         0.20571759 0.
  0.20571759 0.         0.26261375 0.20571759 0.20571759]
 [0.         0.         0.         0.         0.         0.
  0.         0.34041103 0.         0.         0.         0.
  0.         0.34041103 0.         0.         0.34041103 0.
  0.         0.34041103 0.34041103 0.         0.         0.
  0.         0.         0.         0.34041103 0.         0.34041103
  0.         0.         0.4345599  0.         0.        ]
 [0.         0.         0.         0.         0.5        0.
  0.         0.         0.         0.         0.5        0.
  0.         0.         0.5        0.         0.         0.
  0.         0.         0.         0.         0.5        0.
  0.        

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Define the sentence
sentence = "I am currently doing practicals I"

# Initialize the TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the sentence to calculate TF-IDF
tfidf_representation = tfidf_vectorizer.fit_transform([sentence])
print(tfidf_representation)

# Get the feature names (words) and their corresponding TF-IDF values
feature_names = tfidf_vectorizer.get_feature_names_out()
print(feature_names)

# Create a dictionary to store TF-IDF values
tfidf_dict = {}
for i, feature in enumerate(feature_names):
    tfidf_dict[feature] = tfidf_representation[0, i]

# Print the TF-IDF representation
print("TF-IDF representation:")
print(tfidf_dict)


  (0, 3)	0.5
  (0, 2)	0.5
  (0, 1)	0.5
  (0, 0)	0.5
['am' 'currently' 'doing' 'practicals']
TF-IDF representation:
{'am': 0.5, 'currently': 0.5, 'doing': 0.5, 'practicals': 0.5}
