In [None]:
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import os
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk


In [None]:
# Root folder that is searched (recursively) for .txt documents.
folder_path = "./docs/"

# Keep prompting until the query contains at least one non-whitespace character.
empty_query_message = "Query cannot be empty."
while True:
    query = input("Enter your query:")
    if query.strip():
        break
    print(f"Error: {empty_query_message}")

# English stop words, held in a set so membership checks are cheap.
# NOTE(review): this needs the NLTK "stopwords" corpus to be available
# locally — confirm it has been downloaded in the target environment.
stop_words = set(stopwords.words("english"))


In [None]:
# Parallel lists: documents[i] is the preprocessed text of file_paths[i].
documents = []
file_paths = []

# Walk the corpus folder recursively. os.walk yields entries in arbitrary
# order, so directory and file names are sorted to make the document order
# (and therefore the order of tied scores in the ranking) reproducible.
for root, dirs, files in os.walk(folder_path):
    dirs.sort()  # in-place, so it also fixes the order os.walk descends in
    for file in sorted(files):
        if not file.endswith(".txt"):
            continue
        file_path = os.path.join(root, file)
        # Explicit encoding: the default otherwise depends on platform/locale.
        with open(file_path, "r", encoding="utf-8") as f:
            document = f.read()
        # Tokenize, lowercase each token once, and drop stop words.
        words = word_tokenize(document)
        lowered_words = (word.lower() for word in words)
        filtered_words = [word for word in lowered_words if word not in stop_words]
        filtered_document = ' '.join(filtered_words)
        documents.append(filtered_document)
        # Keep the full path so same-named files in different subfolders stay
        # distinguishable; take os.path.basename() of it for display.
        file_paths.append(file_path)

In [None]:
# Learn the vocabulary from the corpus, then re-weight the raw term counts
# with TF-IDF.
vectorizer = CountVectorizer(lowercase=True)
tfidf_transformer = TfidfTransformer()
document_counts = vectorizer.fit_transform(documents)
tfidf_matrix = tfidf_transformer.fit_transform(document_counts)

# Preprocess the query like the documents (lowercase, tokenize, drop stop
# words) and project it through the already-fitted vectorizer/transformer.
query_words = word_tokenize(query.lower())
filtered_query = ' '.join(word for word in query_words if word not in stop_words)
query_counts = vectorizer.transform([filtered_query])
query_vector = tfidf_transformer.transform(query_counts)

# One row per document, one column for the single query.
similarities = cosine_similarity(tfidf_matrix, query_vector)

# Document indices ordered from highest to lowest score; keep at most five.
ranked_indices = similarities.ravel().argsort()[::-1]
top_indices = ranked_indices[:5]

In [1]:
# Report the best-matching documents, one ranked entry per document.
print(f"Top 5 documents most similar to the entered query: {query}\n")

# enumerate(start=1) supplies the 1-based rank directly, so no manual counter
# is needed; the slice keeps the listing capped at five entries.
for rank, idx in enumerate(top_indices[:5], start=1):
    print(f"Rank {rank}:")
    # basename() so only the file name is shown, never a directory prefix.
    print("Document:", os.path.basename(file_paths[idx]))
    # similarities has a single column (one query); show it as a percentage.
    print("Similarity:", "{:.2%}".format(similarities[idx][0]))
    print("\n")


Top 5 documents most similar to the entered query: Machine

Rank 1:
Document: doc_2.txt
Similarity: 15.32%


Rank 2:
Document: doc_4.txt
Similarity: 13.22%


Rank 3:
Document: doc_1.txt
Similarity: 8.02%


Rank 4:
Document: doc_5.txt
Similarity: 0.00%


Rank 5:
Document: doc_3.txt
Similarity: 0.00%


