Imports

In [60]:
import math
from collections import Counter
import re
import pandas as pd


Database:

In [61]:
# Toy corpus: three short sentences about cats.
example_text = "Cats are loved by everyone"
example_text2 = "I love cats so much"
example_text3 = "OMG cats are amazing, I love them"

# Each list entry counts as one document when document frequencies are computed.
database = [example_text, example_text2, example_text3]

# A longer paragraph kept on its own; it is not added to `database` here.
example_text4 = "Cats are amazing creatures, enchanting us with their graceful movements and independent personalities. Their soft purrs and gentle nuzzles provide comfort, fostering a unique bond with their human companions. With keen senses, agile leaps, and adorable antics, cats bring joy to homes worldwide. Their mysterious charm captivates hearts, making them truly extraordinary."

TF-IDF

There are multiple ways to calculate the TF (term frequency).

I chose to use the logarithmically scaled frequency: $1 + \ln(\text{count})$ for a term that occurs at least once, and $0$ otherwise.

IDF - inverse document frequency
- looks at common/uncommon words - corrects for words like "as", "of", "the", etc.
- minimizes the weighting of frequent terms while making infrequent terms have a higher impact.


In [62]:
def log_scaled_tf(raw_tf):
    """Return the logarithmically scaled term frequency for a raw count.

    A positive count maps to ``1 + ln(raw_tf)``. Anything that is not
    greater than zero maps to ``0``; this guard (not the ``+ 1``) is what
    keeps ``math.log`` from being called on a non-positive value.
    """
    if raw_tf > 0:
        return 1 + math.log(raw_tf)
    return 0

def ltfidf(database, smooth_idf=False):
    """Score every term of a corpus with log-scaled TF multiplied by IDF.

    Term counts are taken over the whole corpus pooled together (not per
    document) and log-scaled with ``log_scaled_tf``. Document frequency
    ``df`` is the number of entries of ``database`` that contain the term,
    and ``N`` is the number of entries.

    Args:
        database: Iterable of text strings; each string is one document.
        smooth_idf: When False (default), IDF is ``ln(N / (1 + df))``.
            With this form a term present in every document gets a
            negative weight and a term present in all but one document
            gets zero. When True, IDF is ``ln((1 + N) / (1 + df)) + 1``,
            which is never below 1.

    Returns:
        dict mapping each lower-cased term to its score, keyed in order of
        first appearance in the corpus. Empty when ``database`` is empty.
    """
    # Tokenize once and reuse the tokens for both term and document counts.
    tokenized_docs = [re.findall(r'\b\w+\b', text.lower()) for text in database]
    n_docs = len(tokenized_docs)

    term_counts = Counter(word for tokens in tokenized_docs for word in tokens)
    ltf = {term: log_scaled_tf(count) for term, count in term_counts.items()}

    # set(tokens) makes each document contribute at most 1 per term.
    term_document_count = Counter(
        term for tokens in tokenized_docs for term in set(tokens)
    )

    if smooth_idf:
        idf = {
            term: math.log((1 + n_docs) / (1 + term_document_count[term])) + 1
            for term in ltf
        }
    else:
        idf = {
            term: math.log(n_docs / (1 + term_document_count[term]))
            for term in ltf
        }

    return {term: ltf[term] * idf[term] for term in ltf}

# Score the toy corpus and print one line per term.
ltfidf_result = ltfidf(database)

for term in ltfidf_result:
    ltf_idf = ltfidf_result[term]
    print(f"Term: {term}, LTFIDF: {ltf_idf:.4f}")

print("==========")


Term: cats, LTFIDF: -0.6037
Term: are, LTFIDF: 0.0000
Term: loved, LTFIDF: 0.4055
Term: by, LTFIDF: 0.4055
Term: everyone, LTFIDF: 0.4055
Term: i, LTFIDF: 0.0000
Term: love, LTFIDF: 0.0000
Term: so, LTFIDF: 0.4055
Term: much, LTFIDF: 0.4055
Term: omg, LTFIDF: 0.4055
Term: amazing, LTFIDF: 0.4055
Term: them, LTFIDF: 0.4055


And then I decided that the simple TF should also work

In [63]:
def tfidf(database, smooth_idf=False):
    """Score every term of a corpus with plain TF multiplied by IDF.

    TF is the term's share of all tokens in the whole corpus pooled
    together (not per document). Document frequency ``df`` is the number
    of entries of ``database`` that contain the term, and ``N`` is the
    number of entries.

    Args:
        database: Iterable of text strings; each string is one document.
        smooth_idf: When False (default), IDF is ``ln(N / (1 + df))``.
            With this form a term present in every document gets a
            negative weight and a term present in all but one document
            gets zero. When True, IDF is ``ln((1 + N) / (1 + df)) + 1``,
            which is never below 1.

    Returns:
        dict mapping each lower-cased term to its score, keyed in order of
        first appearance in the corpus. Empty when ``database`` is empty.
    """
    # Tokenize once and reuse the tokens for both term and document counts.
    tokenized_docs = [re.findall(r'\b\w+\b', text.lower()) for text in database]
    n_docs = len(tokenized_docs)

    term_counts = Counter(word for tokens in tokenized_docs for word in tokens)
    total_terms = sum(term_counts.values())

    # Share of each term among all tokens of the pooled corpus.
    tf = {term: count / total_terms for term, count in term_counts.items()}

    # set(tokens) makes each document contribute at most 1 per term.
    term_document_count = Counter(
        term for tokens in tokenized_docs for term in set(tokens)
    )

    if smooth_idf:
        idf = {
            term: math.log((1 + n_docs) / (1 + term_document_count[term])) + 1
            for term in tf
        }
    else:
        idf = {
            term: math.log(n_docs / (1 + term_document_count[term]))
            for term in tf
        }

    return {term: tf[term] * idf[term] for term in tf}

# Score the toy corpus with the plain-TF variant and print one line per term.
tfidf_result = tfidf(database)

for term in tfidf_result:
    tf_idf = tfidf_result[term]
    print(f"Term: {term}, TFIDF: {tf_idf:.4f}")


Term: cats, TFIDF: -0.0508
Term: are, TFIDF: 0.0000
Term: loved, TFIDF: 0.0239
Term: by, TFIDF: 0.0239
Term: everyone, TFIDF: 0.0239
Term: i, TFIDF: 0.0000
Term: love, TFIDF: 0.0000
Term: so, TFIDF: 0.0239
Term: much, TFIDF: 0.0239
Term: omg, TFIDF: 0.0239
Term: amazing, TFIDF: 0.0239
Term: them, TFIDF: 0.0239


And then I discovered that you can do all of this using TfidfVectorizer, which basically does everything you need but in simpler terms

In [64]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [65]:
# NOTE: this rebinds the name `tfidf`, so the hand-written tfidf() function
# defined earlier is no longer reachable under that name after this cell runs.
tfidf = TfidfVectorizer()
result = tfidf.fit_transform(database)

print('\nidf values:')
feature_names = tfidf.get_feature_names_out()
idf_values = tfidf.idf_
# Pair each vocabulary word with the idf weight at the same position.
for position, feature in enumerate(feature_names):
    idf_value = idf_values[position]
    print(feature, ':', idf_value)


idf values:
amazing : 1.6931471805599454
are : 1.2876820724517808
by : 1.6931471805599454
cats : 1.0
everyone : 1.6931471805599454
love : 1.2876820724517808
loved : 1.6931471805599454
much : 1.6931471805599454
omg : 1.6931471805599454
so : 1.6931471805599454
them : 1.6931471805599454


In [66]:
print('\nWord indexes:')
# Collect the (word, index) pairs of the fitted vocabulary and order them by index.
sorted_tfidf_vocabulary = list(tfidf.vocabulary_.items())
sorted_tfidf_vocabulary.sort(key=lambda pair: pair[1])
print(sorted_tfidf_vocabulary)


Word indexes:
[('amazing', 0), ('are', 1), ('by', 2), ('cats', 3), ('everyone', 4), ('love', 5), ('loved', 6), ('much', 7), ('omg', 8), ('so', 9), ('them', 10)]


In [67]:
# Each printed entry reads "(a, b) weight": a is the document index,
# b is the word index.
header = '\ntf-idf value:'
print(header)
print(result)


tf-idf value:
  (0, 4)	0.5046113401371842
  (0, 2)	0.5046113401371842
  (0, 6)	0.5046113401371842
  (0, 1)	0.3837699307603192
  (0, 3)	0.2980315863446099
  (1, 7)	0.5844829010200651
  (1, 9)	0.5844829010200651
  (1, 5)	0.444514311537431
  (1, 3)	0.34520501686496574
  (2, 10)	0.4711101009983051
  (2, 0)	0.4711101009983051
  (2, 8)	0.4711101009983051
  (2, 5)	0.35829137488557944
  (2, 1)	0.35829137488557944
  (2, 3)	0.2782452148327134


In [68]:
print('\ntf-idf values with word indices:')
feature_names = tfidf.get_feature_names_out()

# Walk the matrix one document (row) at a time; documents are shown 1-based.
for i, doc in enumerate(result):
    word_ids, weights = doc.indices, doc.data
    for j, value in zip(word_ids, weights):
        # j is the word index, so it also selects the word's name.
        word = feature_names[j]
        print(f"  ({i + 1}, {word}:{j}) {value:.4f}")


tf-idf values with word indices:
  (1, everyone:4) 0.5046
  (1, by:2) 0.5046
  (1, loved:6) 0.5046
  (1, are:1) 0.3838
  (1, cats:3) 0.2980
  (2, much:7) 0.5845
  (2, so:9) 0.5845
  (2, love:5) 0.4445
  (2, cats:3) 0.3452
  (3, them:10) 0.4711
  (3, amazing:0) 0.4711
  (3, omg:8) 0.4711
  (3, love:5) 0.3583
  (3, are:1) 0.3583
  (3, cats:3) 0.2782


In [70]:
# Convert the result to a dense array so every document/word cell is visible.
dense_matrix = result.toarray()
print('\ntf-idf values in matrix form:')
print(dense_matrix)


tf-idf values in matrix form:
[[0.         0.38376993 0.50461134 0.29803159 0.50461134 0.
  0.50461134 0.         0.         0.         0.        ]
 [0.         0.         0.         0.34520502 0.         0.44451431
  0.         0.5844829  0.         0.5844829  0.        ]
 [0.4711101  0.35829137 0.         0.27824521 0.         0.35829137
  0.         0.         0.4711101  0.         0.4711101 ]]


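So basically, each row is a document (example text), and each column is a word from the corpus - we have 11 words for now so wooohoooo (one fewer than the 12 terms from the hand-written versions above, because TfidfVectorizer's default tokenizer skips single-character tokens such as "i")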