In [3]:
import nltk
import numpy as np
import re

# Input sentences
sentence1 = "The cat is sleeping on the warm couch"
sentence2 = "I enjoy baking cookies on a cold afternoon"
sentence3 = "Tomorrow we will go hiking in the mountains"

# Preprocess the text: lowercase each sentence, replace every non-word
# character with a space, then collapse whitespace runs into one space.
# The final strip() drops the stray leading/trailing space that punctuation
# at either end of a sentence (e.g. a closing period) would otherwise leave.
text_corpus = []
for sentence in (sentence1, sentence2, sentence3):
    cleaned = re.sub(r'\W', ' ', sentence.lower())
    text_corpus.append(re.sub(r'\s+', ' ', cleaned).strip())

# Calculate word frequencies
from collections import Counter
import heapq

# Counter tallies every token across the whole corpus in one pass; its keys
# are kept in first-seen order, the same order the manual dict produced.
word_frequency = Counter(
    word for line in text_corpus for word in nltk.word_tokenize(line)
)

# Select the most frequent words
# nlargest ranks the vocabulary by its count and keeps the top 8 entries.
top_words = heapq.nlargest(8, word_frequency, key=word_frequency.get)

print("Top Words:", top_words)

# Calculate Term Frequency (TF)
# Tokenize each document once up front instead of re-tokenizing it for every
# (word, document) pair.
tf_token_lists = [nltk.word_tokenize(doc) for doc in text_corpus]

# term_frequency maps each top word to one score per document:
# (occurrences of the word in the document) / (number of tokens in it).
term_frequency = {}
for word in top_words:
    term_frequency[word] = [
        # A document with no tokens scores 0.0 rather than dividing by zero.
        tokens.count(word) / len(tokens) if tokens else 0.0
        for tokens in tf_token_lists
    ]

print("Term Frequency:")
print(term_frequency)

# Calculate Inverse Document Frequency (IDF)
# Tokenize each document once and keep its distinct tokens in a set, so the
# per-word membership checks below do not re-tokenize the corpus.
idf_token_sets = [set(nltk.word_tokenize(doc)) for doc in text_corpus]

# inverse_document_frequency maps each top word to
# log(number of documents / number of documents containing the word).
inverse_document_frequency = {}
for word in top_words:
    # Every word in top_words was counted from text_corpus itself, so it
    # appears in at least one document and doc_count is never zero here.
    doc_count = sum(word in token_set for token_set in idf_token_sets)
    inverse_document_frequency[word] = np.log(len(text_corpus) / (doc_count))

print("Inverse Document Frequency:")
print(inverse_document_frequency)

# Calculate TF-IDF
# One row per word (in term_frequency's key order); each row holds that
# word's per-document TF scaled by the word's IDF.
tfidf_scores = [
    [tf * inverse_document_frequency[term] for tf in tf_row]
    for term, tf_row in term_frequency.items()
]

print("TF-IDF Values:")
print(tfidf_scores)


Top Words: ['the', 'on', 'cat', 'is', 'sleeping', 'warm', 'couch', 'i']
Term Frequency:
{'the': [0.25, 0.0, 0.125], 'on': [0.125, 0.125, 0.0], 'cat': [0.125, 0.0, 0.0], 'is': [0.125, 0.0, 0.0], 'sleeping': [0.125, 0.0, 0.0], 'warm': [0.125, 0.0, 0.0], 'couch': [0.125, 0.0, 0.0], 'i': [0.0, 0.125, 0.0]}
Inverse Document Frequency:
{'the': 0.4054651081081644, 'on': 0.4054651081081644, 'cat': 1.0986122886681098, 'is': 1.0986122886681098, 'sleeping': 1.0986122886681098, 'warm': 1.0986122886681098, 'couch': 1.0986122886681098, 'i': 1.0986122886681098}
TF-IDF Values:
[[0.1013662770270411, 0.0, 0.05068313851352055], [0.05068313851352055, 0.05068313851352055, 0.0], [0.13732653608351372, 0.0, 0.0], [0.13732653608351372, 0.0, 0.0], [0.13732653608351372, 0.0, 0.0], [0.13732653608351372, 0.0, 0.0], [0.13732653608351372, 0.0, 0.0], [0.0, 0.13732653608351372, 0.0]]
