In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from nltk.tokenize import word_tokenize


In [7]:
corpus = [
    'The quick brown fox jumped over the lazy dog.',
    'The dog barked at the fox.',
    'Foxes are clever animals.',
]

In [11]:
vectorizer = TfidfVectorizer()

In [64]:
matrix = vectorizer.fit_transform(corpus)

In [66]:
matrix.shape

(3, 14)

In [23]:
features = vectorizer.get_feature_names_out()

In [27]:
len(features)

14

In [50]:
test_set = set()
for doc in corpus:
    tokens = word_tokenize(doc)
    for token in tokens:
        token = token.lower()
        test_set.add(token)
        
print(len(test_set))
    

15


In [52]:
features

array(['animals', 'are', 'at', 'barked', 'brown', 'clever', 'dog', 'fox',
       'foxes', 'jumped', 'lazy', 'over', 'quick', 'the'], dtype=object)

In [54]:
test_set

{'.',
 'animals',
 'are',
 'at',
 'barked',
 'brown',
 'clever',
 'dog',
 'fox',
 'foxes',
 'jumped',
 'lazy',
 'over',
 'quick',
 'the'}

In [56]:
not_included = []
for item in test_set:
    if item not in features:
        not_included.append(item)

In [58]:
not_included

['.']

In [69]:
cos_sim = cosine_similarity(np.transpose(matrix))

In [71]:
cos_sim.shape

(14, 14)

In [83]:
highest_similarity = np.max(cos_sim)
highest_similarity

np.float64(1.0)

In [85]:
location = np.where(cos_sim == highest_similarity)

In [87]:
location

(array([ 0,  0,  0,  0,  1,  1,  1,  1,  2,  2,  3,  3,  4,  4,  4,  4,  4,
         5,  5,  5,  5,  8,  8,  8,  8,  9,  9,  9,  9,  9, 10, 10, 10, 10,
        10, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12]),
 array([ 0,  1,  5,  8,  0,  1,  5,  8,  2,  3,  2,  3,  4,  9, 10, 11, 12,
         0,  1,  5,  8,  0,  1,  5,  8,  4,  9, 10, 11, 12,  4,  9, 10, 11,
        12,  4,  9, 10, 11, 12,  4,  9, 10, 11, 12]))

In [89]:
features[0], features[1]

('animals', 'are')

In [94]:
cos_sim[0, 1]

np.float64(1.0)