In [20]:
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [2]:
# Sample Corpus
documents = [
    "Artificial Intelligence (AI) is the simulation of human intelligence processes by machines.",
    "Machine learning is a subset of AI that focuses on the development of algorithms and statistical models.",
    "Deep learning is a subfield of machine learning that uses artificial neural networks to model and solve complex tasks.",
    "Natural Language Processing (NLP) is a branch of AI that deals with the interaction between computers and human languages.",
    "AI technology is advancing rapidly and is being used in various fields, including healthcare, finance, and autonomous vehicles."
]

# Query about AI technology
query = "How is AI technology being applied in healthcare and finance?"


In [7]:
# tokenization

tokenized_docs = [word_tokenize(d) for d in documents]
tokenized_query = word_tokenize(query)



['How', 'is', 'AI', 'technology', 'being', 'applied', 'in', 'healthcare', 'and', 'finance', '?']


In [13]:
# tfidf vectorizer

v = TfidfVectorizer()

# for corpus

tfidf_matrix = v.fit_transform([" ".join(d) for d in tokenized_docs])
tfidf_query = v.transform([" ".join(tokenized_query)])



  (0, 47)	0.41303074817158975
  (0, 24)	0.19681136453924075
  (0, 20)	0.41303074817158975
  (0, 18)	0.41303074817158975
  (0, 16)	0.41303074817158975
  (0, 6)	0.41303074817158975
  (0, 3)	0.23269431891677766
  (0, 1)	0.23269431891677766


In [19]:
# cos similarity

cos_simi = cosine_similarity(tfidf_query , tfidf_matrix)
cosine_sim = [y for x in cos_simi for y in x]
print(cosine_sim)

[0.06903087898456996, 0.10575396603388984, 0.057649802224362647, 0.09607860571329427, 0.6721893629091699]


In [22]:
# sort similarities

sort = np.argsort(cosine_sim)[::-1]
print(sort)

[4 1 3 0 2]


In [27]:
champion_lists = {}
for i , term in enumerate(tokenized_query):
    term_documents = tfidf_matrix[:,i].toarray().flatten()
    champion_list = list(np.argsort(term_documents)[::-1][:3])
    champion_lists[term] = champion_list
    
    
print(champion_lists)

{'How': [4, 3, 2], 'is': [0, 1, 3], 'AI': [1, 4, 3], 'technology': [4, 1, 3], 'being': [0, 2, 4], 'applied': [4, 3, 2], 'in': [4, 3, 2], 'healthcare': [3, 4, 2], 'and': [3, 4, 2], 'finance': [0, 4, 3], '?': [2, 4, 3]}
