In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


# Toy corpus: each string is one document. A document's position in this list
# is the index that later cells use to look it up (docs[i]).
docs = [
    "it is a good day, I like to stay here",
    "I am happy to be here",
    "I am bob",
    "it is sunny today",
    "I have a party today",
    "it is a dog and that is a cat",
    "there are dog and cat on the tree",
    "I study hard this morning",
    "today is a good day",
    "tomorrow will be a good day",
    "I like coffee, I like book and I like apple",
    "I do not like it",
    "I am kitty, I like bob",
    "I do not care who like bob, but I like kitty",
    "It is coffee time, bring your cup",
]

# Fit a default TF-IDF model on the corpus and keep the resulting
# document-term matrix (one row per entry of `docs`).
vectorizer = TfidfVectorizer()
tf_idf = vectorizer.fit_transform(docs)

# Show every vocabulary term next to its learned IDF weight.
print("idf: ", list(zip(vectorizer.get_feature_names_out(), vectorizer.idf_)))
# vocabulary_ is the term -> column-index mapping ("v2i").
print("v2i: ", vectorizer.vocabulary_)


idf:  [('am', 2.386294361119891), ('and', 2.386294361119891), ('apple', 3.0794415416798357), ('are', 3.0794415416798357), ('be', 2.6739764335716716), ('bob', 2.386294361119891), ('book', 3.0794415416798357), ('bring', 3.0794415416798357), ('but', 3.0794415416798357), ('care', 3.0794415416798357), ('cat', 2.6739764335716716), ('coffee', 2.6739764335716716), ('cup', 3.0794415416798357), ('day', 2.386294361119891), ('do', 2.6739764335716716), ('dog', 2.6739764335716716), ('good', 2.386294361119891), ('happy', 3.0794415416798357), ('hard', 3.0794415416798357), ('have', 3.0794415416798357), ('here', 2.6739764335716716), ('is', 1.9808292530117262), ('it', 1.9808292530117262), ('kitty', 2.6739764335716716), ('like', 1.9808292530117262), ('morning', 3.0794415416798357), ('not', 2.6739764335716716), ('on', 3.0794415416798357), ('party', 3.0794415416798357), ('stay', 3.0794415416798357), ('study', 3.0794415416798357), ('sunny', 3.0794415416798357), ('that', 3.0794415416798357), ('the', 3.0794415

In [10]:
# Display the learned IDF weights as a NumPy array. Entry k belongs to term k of
# vectorizer.get_feature_names_out() (the first cell zips the two together).
vectorizer.idf_

array([2.38629436, 2.38629436, 3.07944154, 3.07944154, 2.67397643,
       2.38629436, 3.07944154, 3.07944154, 3.07944154, 3.07944154,
       2.67397643, 2.67397643, 3.07944154, 2.38629436, 2.67397643,
       2.67397643, 2.38629436, 3.07944154, 3.07944154, 3.07944154,
       2.67397643, 1.98082925, 1.98082925, 2.67397643, 1.98082925,
       3.07944154, 2.67397643, 3.07944154, 3.07944154, 3.07944154,
       3.07944154, 3.07944154, 3.07944154, 3.07944154, 3.07944154,
       3.07944154, 3.07944154, 2.67397643, 2.38629436, 3.07944154,
       3.07944154, 3.07944154, 3.07944154, 3.07944154])

In [13]:
# Display the vocabulary terms in column order. With the default TfidfVectorizer()
# settings used here, the output shows lowercased terms ("It" -> "it") and no
# one-letter words from the corpus ("I", "a").
vectorizer.get_feature_names_out()

array(['am', 'and', 'apple', 'are', 'be', 'bob', 'book', 'bring', 'but',
       'care', 'cat', 'coffee', 'cup', 'day', 'do', 'dog', 'good',
       'happy', 'hard', 'have', 'here', 'is', 'it', 'kitty', 'like',
       'morning', 'not', 'on', 'party', 'stay', 'study', 'sunny', 'that',
       'the', 'there', 'this', 'time', 'to', 'today', 'tomorrow', 'tree',
       'who', 'will', 'your'], dtype=object)

In [6]:

# Rank the corpus against a free-text query using the already-fitted vectorizer.
q = "I get a coffee cup"
# transform() rather than fit_transform(): the query is vectorized with the fitted
# model instead of refitting it.
qtf_idf = vectorizer.transform([q])
# One cosine score per document, flattened to 1-D. argsort() is ascending, so the
# final three positions are the three highest-scoring document indices, best last.
# NOTE: the slice always yields three indices, even if a document shares no term
# with the query (score 0).
res = cosine_similarity(tf_idf, qtf_idf).ravel().argsort()[-3:]
res

array([13, 10, 14], dtype=int64)

In [8]:
# `res` comes from an ascending argsort slice, so reversing it puts the
# best-scoring document index first.
res[::-1]

array([14, 10, 13], dtype=int64)

In [9]:
# Print the matched documents, highest similarity first (`res` is ascending, so walk it backwards).
print("\ntop 3 docs for '{}':\n{}".format(q, [docs[idx] for idx in reversed(res)]))



top 3 docs for 'I get a coffee cup':
['It is coffee time, bring your cup', 'I like coffee, I like book and I like apple', 'I do not care who like bob, but I like kitty']


In [3]:

# Query demo: vectorize the query with the fitted model, score every document by
# cosine similarity, and keep the indices of the three highest scores (ascending).
q = "I get a coffee cup"
qtf_idf = vectorizer.transform([q])
res = cosine_similarity(tf_idf, qtf_idf).ravel().argsort()[-3:]
# Walk the ascending indices backwards so the best match is printed first.
print("\ntop 3 docs for '{}':\n{}".format(q, [docs[idx] for idx in reversed(res)]))


# Inverse of vocabulary_: column index -> term.
i2v = {index: term for term, index in vectorizer.vocabulary_.items()}
# Dense version of the document-term TF-IDF matrix.
# NOTE(review): todense() yields a numpy.matrix; toarray() would give a plain
# ndarray. Confirm what downstream code expects before switching.
dense_tfidf = tf_idf.todense()


top 3 docs for 'I get a coffee cup':
['It is coffee time, bring your cup', 'I like coffee, I like book and I like apple', 'I do not care who like bob, but I like kitty']
