### Create a program for Index Elimination - Only consider high-idf query terms.

In [1]:
import numpy as np
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [2]:
# document scope

documents =[
    "This is document 1. It contains some text for testing.",
    "Document 2 has different content for testing purposes.",
    "The third document is here with unique words."
]

# querry

query = "This is a test query for information retrieval"

In [7]:
document_tokenized = [word_tokenize(i)for i in documents ]
print(document_tokenized)

query_tokenized = word_tokenize(query)
print(query_tokenized)

[['This', 'is', 'document', '1', '.', 'It', 'contains', 'some', 'text', 'for', 'testing', '.'], ['Document', '2', 'has', 'different', 'content', 'for', 'testing', 'purposes', '.'], ['The', 'third', 'document', 'is', 'here', 'with', 'unique', 'words', '.']]
['This', 'is', 'a', 'test', 'query', 'for', 'information', 'retrieval']


In [10]:
v = TfidfVectorizer()
doc_matrix = v.fit_transform([' '.join(d) for d in document_tokenized])


In [15]:
query_matrix = v.transform([' '.join(query_tokenized)])
print(query_matrix)

  (0, 15)	0.680918560398684
  (0, 7)	0.5178561161676974
  (0, 4)	0.5178561161676974


In [21]:
cos_simi = np.dot(query_matrix , doc_matrix.T).toarray()[0]
print(cos_simi)

[0.55177848 0.16784936 0.1496385 ]


In [24]:
sort = np.argsort(cos_simi)[::-1]
sort

array([0, 1, 2], dtype=int64)

In [27]:
tfidf_threshold = 0.2
selected_query_terms =[term for term,tfidf in zip(v.get_feature_names_out(),query_matrix.toarray()[0]) if tfidf>= tfidf_threshold]
print(selected_query_terms)

['for', 'is', 'this']


### Create a program for Index Elimination - Only consider docs containing many query terms (Jaccard Coefficient).

In [29]:
# document scope

documents =[
    "This is document 1. It contains some text for testing.",
    "Document 2 has different content for testing purposes.",
    "The third document is here with unique words."
]

# querry

query = "This is a test query for information retrieval"

In [45]:
doc_tokenized = [set(word_tokenize(d)) for d in documents]
que_tokenized = set(word_tokenize(query))



In [48]:
jaccard_simi = [len(que_tokenized.intersection(d)) / len(que_tokenized.union(d)) for d in doc_tokenized]
jaccard_simi

[0.1875, 0.0625, 0.0625]

In [51]:
sort_j = np.argsort(jaccard_simi)[::-1]
sort_j

array([0, 2, 1], dtype=int64)

In [57]:
jaccard_threshold = 0.1
selected_doc = [d for d , s in zip(documents ,jaccard_simi) if s > jaccard_threshold]
selected_doc

['This is document 1. It contains some text for testing.']

### Champion List

In [None]:
# document scope

documents =[
    "This is document 1. It contains some text for testing.",
    "Document 2 has different content for testing purposes.",
    "The third document is here with unique words."
]

# querry

query = "This is a test query for information retrieval"

In [None]:
doc_tokenized = [set(word_tokenize(d)) for d in documents]
que_tokenized = set(word_tokenize(query))


In [62]:
v = TfidfVectorizer()
d_matrix = v.fit_transform([' '.join(d) for d in doc_tokenized ])
print(d_matrix)

  (0, 7)	0.28574186296253085
  (0, 4)	0.28574186296253085
  (0, 3)	0.221904046872743
  (0, 10)	0.3757162113174268
  (0, 12)	0.3757162113174268
  (0, 8)	0.3757162113174268
  (0, 0)	0.3757162113174268
  (0, 15)	0.3757162113174268
  (0, 11)	0.28574186296253085
  (1, 1)	0.42618350336974425
  (1, 5)	0.42618350336974425
  (1, 2)	0.42618350336974425
  (1, 9)	0.42618350336974425
  (1, 4)	0.3241235393856436
  (1, 3)	0.2517108425440014
  (1, 11)	0.3241235393856436
  (2, 16)	0.3799446164315741
  (2, 18)	0.3799446164315741
  (2, 17)	0.3799446164315741
  (2, 13)	0.3799446164315741
  (2, 6)	0.3799446164315741
  (2, 14)	0.3799446164315741
  (2, 7)	0.28895767404089806
  (2, 3)	0.22440141104916914


In [67]:
q_matrix = v.transform([" ".join(que_tokenized)])
print(q_matrix)

  (0, 15)	0.680918560398684
  (0, 7)	0.5178561161676974
  (0, 4)	0.5178561161676974


In [71]:
cs = list(cosine_similarity(q_matrix , d_matrix)[0])
cs

[0.5517784844893074, 0.16784935726477715, 0.149638498815671]

In [76]:
champion_list = {}

for i , term in enumerate(que_tokenized):
    term_doc = doc_matrix[:,i].toarray().flatten()
    cl = list(np.argsort(term_doc)[::-1][:3])
    champion_list[term] = cl
print(champion_list)                                   

{'This': [0, 2, 1], 'query': [1, 2, 0], 'information': [1, 2, 0], 'a': [1, 2, 0], 'test': [1, 0, 2], 'retrieval': [1, 2, 0], 'for': [2, 1, 0], 'is': [2, 0, 1]}


In [80]:
selected_docs = set()
for term in que_tokenized:
    selected_docs.update(champion_list.get(term,[]))
    
print(selected_docs)

{0, 1, 2}
