In [1]:
import nltk
nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_ru is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package basque_grammars is already up-to-date!
[nltk_data]    | Downloading package bcp47 to /root/nltk_data...
[nltk_data]    |   Package bcp47 is already up-to-dat

True

In [2]:
import string
import math
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
with open('document.txt', 'r') as file:
  text = file.read()

print(text)

The cat sat on the mat.
The dog ran after the cat.
The bird flew in the sky.


In [4]:
# 1) tokenization (In-built are nltk.word_tokenize())
def tokenize(text):
  words = text.split()
  tokens = []
  for word in words:
    word = word.translate(str.maketrans('', '', string.punctuation))
    if word:
      tokens.append(word.lower())
  return tokens


word_tokens = tokenize(text)
print(word_tokens)

['the', 'cat', 'sat', 'on', 'the', 'mat', 'the', 'dog', 'ran', 'after', 'the', 'cat', 'the', 'bird', 'flew', 'in', 'the', 'sky']


In [5]:
# 2) POS Tag
print("Parts of speech : ", nltk.pos_tag(word_tokens))

Parts of speech :  [('the', 'DT'), ('cat', 'NN'), ('sat', 'VBD'), ('on', 'IN'), ('the', 'DT'), ('mat', 'NN'), ('the', 'DT'), ('dog', 'NN'), ('ran', 'VBD'), ('after', 'IN'), ('the', 'DT'), ('cat', 'NN'), ('the', 'DT'), ('bird', 'NN'), ('flew', 'VBD'), ('in', 'IN'), ('the', 'DT'), ('sky', 'NN')]


In [6]:
# 3 Stop Words Removal
stop_words = nltk.corpus.stopwords.words('english')

def remove_stopWords(tokens, sw):
  filtered_tokens = [word for word in tokens if word not in sw]
  return filtered_tokens

filtered_tokens = remove_stopWords(word_tokens, stop_words)
print(filtered_tokens)

['cat', 'sat', 'mat', 'dog', 'ran', 'cat', 'bird', 'flew', 'sky']


In [7]:
# 4) Stemming
ps = nltk.PorterStemmer()

def apply_stemming(tokens):
  stemmed_tokens = [ps.stem(word) for word in tokens]
  return stemmed_tokens

stemmed_tokens = apply_stemming(filtered_tokens)
print(stemmed_tokens)


['cat', 'sat', 'mat', 'dog', 'ran', 'cat', 'bird', 'flew', 'sky']


In [8]:
# 5 Lemmatization
lemmatizer = nltk.WordNetLemmatizer()

def apply_lemmatization(tokens):
  lemmatized_tokens = [lemmatizer.lemmatize(word) for word in tokens]
  return lemmatized_tokens

lemmatized_tokens = apply_lemmatization(filtered_tokens)
print(lemmatized_tokens)

['cat', 'sat', 'mat', 'dog', 'ran', 'cat', 'bird', 'flew', 'sky']


In [9]:
# 6) Calculate TF
tf = {}
total_terms = len(lemmatized_tokens)
for term in lemmatized_tokens:
  tf[term] = tf.get(term, 0) + 1

tf_normalized = {term: freq / total_terms for term, freq in tf.items()}

# 7) Calculate IDF
document_frequency = {}
for term in set(lemmatized_tokens):
  document_frequency[term] = sum(1 for doc_tokens in [lemmatized_tokens] if term in doc_tokens)

idf = {term : math.log10(len([lemmatized_tokens]) / df) for term, df in document_frequency.items()}

tfidf = {term : tf_normalized[term] * idf[term] for term in tf_normalized}

for term in tfidf:
  print(f"{term}: TF = {tf_normalized[term]}, IDF = {idf[term]}, TF-IDF = {tfidf[term]}")

cat: TF = 0.2222222222222222, IDF = 0.0, TF-IDF = 0.0
sat: TF = 0.1111111111111111, IDF = 0.0, TF-IDF = 0.0
mat: TF = 0.1111111111111111, IDF = 0.0, TF-IDF = 0.0
dog: TF = 0.1111111111111111, IDF = 0.0, TF-IDF = 0.0
ran: TF = 0.1111111111111111, IDF = 0.0, TF-IDF = 0.0
bird: TF = 0.1111111111111111, IDF = 0.0, TF-IDF = 0.0
flew: TF = 0.1111111111111111, IDF = 0.0, TF-IDF = 0.0
sky: TF = 0.1111111111111111, IDF = 0.0, TF-IDF = 0.0
