In [1]:
import json
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [9]:
# Execute the shared search-engine notebook so the names it defines become
# available in this session.
# NOTE(review): load_preprocessed_data and preprocess_query are called further
# down but are not defined in this notebook — presumably they come from here; confirm.
%run ../utils/search_engine.ipynb

In [3]:
# Load the preprocessed corpus.
# The path is relative, so it resolves against the kernel's current working directory.
# NOTE(review): later cells index every item by "document_name" and "content",
# so load_preprocessed_data presumably returns a sequence of dicts — confirm.
input_file = "preprocessed_data.json"
documents = load_preprocessed_data(input_file)

In [4]:
# Split the corpus into two parallel lists that share the same document order:
# the file names (reported in the results) and the text bodies (fed to the vectorizer).
document_names = [
    record["document_name"]
    for record in documents
]
document_contents = [
    record["content"]
    for record in documents
]

In [6]:
# Fit a TF-IDF model on the corpus and vectorize the corpus in the same step.
# TfidfVectorizer() is used with its default settings; fit_transform learns the
# vocabulary and IDF weights from document_contents and returns a sparse matrix
# with one row per document and one column per vocabulary term.
# The fitted `vectorizer` is reused later to project the query into the same space.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(document_contents)
# Printing a sparse matrix lists its stored (row, column) -> value entries.
print(tfidf_matrix)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 10302 stored elements and shape (8, 5314)>
  Coords	Values
  (0, 3027)	0.0030361640834519984
  (0, 4111)	0.022366899190037207
  (0, 2832)	0.04042877195894201
  (0, 2699)	0.08409184567459937
  (0, 256)	0.005786993814620057
  (0, 641)	0.7100629399172355
  (0, 3728)	0.08332371233722662
  (0, 3629)	0.1321546166330675
  (0, 4840)	0.007245540203237096
  (0, 5034)	0.007859881469964343
  (0, 2962)	0.02756553482617432
  (0, 4781)	0.0054310116424548275
  (0, 2469)	0.0032343017567153607
  (0, 2408)	0.0202544783511702
  (0, 4736)	0.005786993814620057
  (0, 2607)	0.009108492250355996
  (0, 2920)	0.004594255804362386
  (0, 4240)	0.0032343017567153607
  (0, 1592)	0.0032343017567153607
  (0, 5041)	0.008085754391788403
  (0, 280)	0.011485639510905964
  (0, 1798)	0.0030361640834519984
  (0, 510)	0.003622770101618548
  (0, 1927)	0.003622770101618548
  (0, 519)	0.012144656333807994
  :	:
  (7, 646)	0.02258322799434671
  (7, 5120)	0.022583227994

In [15]:
# Read a search query interactively and preprocess it before vectorization.
# input() blocks until something is typed, so every result further down depends
# on what was entered for this particular run.
# NOTE(review): preprocess_query is not defined in this notebook — presumably it is
# supplied by the %run'd search_engine notebook and mirrors the preprocessing that
# was applied to the documents; confirm.
user_query = input("Enter your search query: ")
processed_query = preprocess_query(user_query)

In [16]:
# Project the query into the TF-IDF space learned from the corpus.
# transform() takes an iterable of documents, hence the one-element list; the
# already-fitted vocabulary is reused rather than re-learned.
query_batch = [processed_query]
query_vector = vectorizer.transform(query_batch)
print(query_vector)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 2 stored elements and shape (1, 5314)>
  Coords	Values
  (0, 305)	0.7457783383592639
  (0, 2468)	0.6661941684179585


In [17]:
# Score every document against the query.
# cosine_similarity returns a 2-D array with one row per query; there is a single
# query here, so collapse it into a flat vector with one score per document.
pairwise_scores = cosine_similarity(query_vector, tfidf_matrix)
similarity_scores = pairwise_scores.flatten()
print(similarity_scores)

[0.00215467 0.17720284 0.12500572 0.02352262 0.01540357 0.00549774
 0.00436192 0.        ]


In [18]:
# Rank documents from most to least similar, keeping only those that match the query.
# Sorting the negated scores with a stable algorithm yields descending order while
# leaving tied documents in their original corpus order, so the ranking is
# deterministic (reversing a default argsort leaves the order of ties unspecified).
ranked_indices = np.argsort(-similarity_scores, kind="stable")
# A zero score means the document shares no vocabulary terms with the query; drop it.
# float() turns NumPy scalars into plain Python floats so the printed list looks the
# same on NumPy 1.x and 2.x (2.x would otherwise show np.float64(...) for each score).
ranked_documents = [
    (document_names[i], float(similarity_scores[i]))
    for i in ranked_indices
    if similarity_scores[i] > 0
]
print(ranked_documents)

[('AI technologies for education_ Recent research & future directions.pdf', 0.17720284018134436), ('Complexity - 2021 - Zhai - A Review of Artificial Intelligence  AI  in Education from 2010 to 2020.pdf', 0.1250057195476056), ('Machine-Learning-Algorithms-A-Review.pdf', 0.02352262194864478), ('A_Survey_on_Big_Data_Analytics_Challenge.pdf', 0.015403573302516054), ('CURMay06.pdf', 0.005497741764435141), ('grossman98-Data-minin-research-opportunities.pdf', 0.0043619191369147865), ('1-s2.0-S2001037014000464-main.pdf', 0.0021546729692276767)]


In [19]:
# Report the ranking as a 1-based numbered list with scores rounded to four
# decimals, or say so explicitly when no document matched the query.
if not ranked_documents:
    print("No relevant documents found.")
else:
    print("Ranked Documents:")
    for rank, (doc_name, score) in enumerate(ranked_documents, 1):
        entry = f"{rank}. {doc_name} (Score: {score:.4f})"
        print(entry)

Ranked Documents:
1. AI technologies for education_ Recent research & future directions.pdf (Score: 0.1772)
2. Complexity - 2021 - Zhai - A Review of Artificial Intelligence  AI  in Education from 2010 to 2020.pdf (Score: 0.1250)
3. Machine-Learning-Algorithms-A-Review.pdf (Score: 0.0235)
4. A_Survey_on_Big_Data_Analytics_Challenge.pdf (Score: 0.0154)
5. CURMay06.pdf (Score: 0.0055)
6. grossman98-Data-minin-research-opportunities.pdf (Score: 0.0044)
7. 1-s2.0-S2001037014000464-main.pdf (Score: 0.0022)
