In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.datasets import fetch_20newsgroups
import pandas as pd

def cal_cosine_similarity(tfidf_vector1, tfidf_vector2):
    """Return the cosine similarity between two TF-IDF vectors.

    Each argument is reshaped into a single row (shape ``(1, n)``) before
    being passed to ``cosine_similarity``, so the resulting similarity
    matrix holds exactly one entry; that entry is returned.
    """
    row_a, row_b = (vec.reshape(1, -1) for vec in (tfidf_vector1, tfidf_vector2))
    return cosine_similarity(row_a, row_b)[0, 0]

def similarity_search(input_document, documents, vectorizer):
    """Rank ``documents`` by cosine similarity to ``input_document``.

    Args:
        input_document: Query text to compare against every document.
        documents: Sized, ordered collection of document texts.
        vectorizer: Vectorizer exposing ``transform``. Only ``transform`` is
            called here, so it must have been fitted beforehand.

    Returns:
        A list of ``(document, similarity)`` tuples ordered from most to
        least similar. Documents with equal scores keep their original
        relative order. An empty ``documents`` collection yields ``[]``.
    """
    # len() instead of truthiness so array-like inputs are handled as well.
    if len(documents) == 0:
        return []

    input_tfidf = vectorizer.transform([input_document])
    documents_tfidf = vectorizer.transform(documents)

    # A single call scores the query against every document row at once,
    # instead of invoking cosine_similarity separately for each row.
    scores = cosine_similarity(input_tfidf, documents_tfidf)[0]

    # sorted() is stable, including with reverse=True, so ties stay in
    # input order.
    return sorted(zip(documents, scores), key=lambda pair: pair[1], reverse=True)


# Load the whole 20 Newsgroups corpus with headers, footers and quoted
# replies stripped from each post.
newsgroups = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))

tfidf_vectorizer = TfidfVectorizer(stop_words='english')

# Keep the TF-IDF matrix sparse: calling .toarray() would allocate a dense
# (documents x vocabulary) float array, which needs many gigabytes for this
# corpus. A sparse-backed DataFrame still prints the same labelled preview.
tfidf_matrix = pd.DataFrame.sparse.from_spmatrix(
    tfidf_vectorizer.fit_transform(newsgroups.data),
    columns=tfidf_vectorizer.get_feature_names_out(),
)
print(tfidf_matrix)

input_doc = "Science."

result = similarity_search(input_doc, newsgroups.data, tfidf_vectorizer)

# Show the five best matches, ranked starting from 1.
for rank, (doc, similarity) in enumerate(result[:5], start=1):
    print(f"Rank {rank}: Similarity = {similarity:.4f}\n{doc}\n")

        00  000  0000  00000  000000  00000000  0000000004  00000000b  \
0      0.0  0.0   0.0    0.0     0.0       0.0         0.0        0.0   
1      0.0  0.0   0.0    0.0     0.0       0.0         0.0        0.0   
2      0.0  0.0   0.0    0.0     0.0       0.0         0.0        0.0   
3      0.0  0.0   0.0    0.0     0.0       0.0         0.0        0.0   
4      0.0  0.0   0.0    0.0     0.0       0.0         0.0        0.0   
...    ...  ...   ...    ...     ...       ...         ...        ...   
18841  0.0  0.0   0.0    0.0     0.0       0.0         0.0        0.0   
18842  0.0  0.0   0.0    0.0     0.0       0.0         0.0        0.0   
18843  0.0  0.0   0.0    0.0     0.0       0.0         0.0        0.0   
18844  0.0  0.0   0.0    0.0     0.0       0.0         0.0        0.0   
18845  0.0  0.0   0.0    0.0     0.0       0.0         0.0        0.0   

       00000001  00000001b  ...  zzs  zzvsi  zzy_3w  zzz  zzzoh  zzzzzz  \
0           0.0        0.0  ...  0.0    0.0     