In [2]:
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import string

# Load spaCy's "en_core_web_sm" English pipeline once at module level.
# The code below calls it to tokenize text and reads its default stop-word
# list via nlp.Defaults.stop_words.
nlp = spacy.load("en_core_web_sm")

# Toy corpus: each keyword is a document ID, each value is that document's raw text.
documents = dict(
    doc1="This is the first document.",
    doc2="This document is the second document.",
    doc3="And this is the third one.",
    doc4="Is this the first document?",
)

# Preprocessing function to tokenize and remove stopwords/punctuation using spaCy
def preprocess_text(text):
    """Lowercase *text*, tokenize it with spaCy and drop noise tokens.

    A token is dropped when it is a stop word (per ``nlp.Defaults.stop_words``),
    when spaCy flags it as punctuation or whitespace, or when every character
    in it is ASCII punctuation.

    Args:
        text: Raw input string.

    Returns:
        The surviving token texts joined by single spaces (possibly "").
    """
    stop_words = nlp.Defaults.stop_words
    # Use a character set, not the raw string: `tok in string.punctuation` is a
    # substring test, so tokens such as "..." or "--" would slip through.
    punctuation_chars = set(string.punctuation)

    kept = []
    for token in nlp(text.lower()):
        word = token.text
        if word in stop_words:
            continue
        if token.is_punct or token.is_space:
            continue
        if all(ch in punctuation_chars for ch in word):
            continue
        kept.append(word)
    return " ".join(kept)

# Clean every document's text, keeping the same IDs in the same order
preprocessed_docs = dict(
    zip(documents, map(preprocess_text, documents.values()))
)

# Fit the TF-IDF vocabulary on the cleaned corpus; one matrix row per document
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(preprocessed_docs.values())

# Rank the corpus documents against a query by TF-IDF cosine similarity
def search(query, tfidf_matrix, tfidf_vectorizer):
    """Return every document ranked by cosine similarity to *query*.

    The query is cleaned with ``preprocess_text`` and vectorized with
    *tfidf_vectorizer* before being compared with each row of *tfidf_matrix*.

    Document IDs and texts are read from the module-level
    ``preprocessed_docs`` and ``documents`` dicts; row ``i`` of
    *tfidf_matrix* is assumed to correspond to their ``i``-th entries.

    Args:
        query: Raw query string.
        tfidf_matrix: TF-IDF matrix of the corpus, one row per document.
        tfidf_vectorizer: The fitted vectorizer that produced *tfidf_matrix*.

    Returns:
        A list of ``(doc_id, original_text, score)`` tuples, highest score
        first.
    """
    query_vector = tfidf_vectorizer.transform([preprocess_text(query)])

    # One query row against n document rows gives shape (1, n); keep row 0.
    scores = cosine_similarity(query_vector, tfidf_matrix)[0]

    # argsort is ascending, so reverse it to rank the best match first.
    ranked_indexes = scores.argsort()[::-1]

    # Build the positional lookups once instead of once per result row.
    doc_ids = list(preprocessed_docs)
    doc_texts = list(documents.values())

    return [(doc_ids[i], doc_texts[i], scores[i]) for i in ranked_indexes]

# Prompt the user for a query string
query = input("Enter your query: ")

# Rank all documents against the query
search_results = search(query, tfidf_matrix, tfidf_vectorizer)

# Print the query followed by one entry per ranked document
print("Query:", query)
for rank, (doc_id, doc_text, score) in enumerate(search_results, start=1):
    print(f"\nRank: {rank}")
    print("Document ID:", doc_id)
    print("Document:", doc_text)
    print("Similarity Score:", score)
    print("----------------------")

# Report the largest similarity score among the results
highest_rank_score = max(score for _, _, score in search_results)
print("The highest rank cosine score is:", highest_rank_score)

Enter your query: first document
Query: first document

Rank: 1
Document ID: doc4
Document: Is this the first document?
Similarity Score: 1.0
----------------------

Rank: 2
Document ID: doc1
Document: This is the first document.
Similarity Score: 1.0
----------------------

Rank: 3
Document ID: doc2
Document: This document is the second document.
Similarity Score: 0.78722297610404
----------------------

Rank: 4
Document ID: doc3
Document: And this is the third one.
Similarity Score: 0.0
----------------------
The highest rank cosine score is: 1.0
