In [None]:
from sklearn.datasets import fetch_20newsgroups

# Download (cached locally after the first call) and load the 20 Newsgroups
# corpus. subset='all' combines the train and test splits; stripping headers,
# footers and quoted replies keeps retrieval focused on the message bodies
# rather than on metadata.
newsgroups = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))

# Expose the pieces the later cells work with.
data = newsgroups.data                  # raw text of each post
target = newsgroups.target              # category index of each post
target_names = newsgroups.target_names  # category names, indexed by target value

# Sanity check that the corpus actually loaded.
print(f"Loaded {len(data)} documents across {len(target_names)} categories")


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Learn the TF-IDF vocabulary from the corpus and encode every document in a
# single pass; English stop words are dropped using scikit-learn's built-in list.
vectorizer = TfidfVectorizer(stop_words="english")
X = vectorizer.fit_transform(data)  # one row per document in `data`


In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def retrieve_top_documents(query, X, vectorizer, top_n=3):
  """Return the documents most similar to ``query`` by cosine similarity.

  Args:
    query: Free-text query string.
    X: Matrix of document vectors, one row per document, produced by the
      same ``vectorizer`` so that query and documents share a feature space.
    vectorizer: Fitted vectorizer exposing ``transform``.
    top_n: Maximum number of documents to return. ``0`` yields empty
      results; values larger than the corpus return every document.

  Returns:
    A ``(indices, scores)`` tuple of arrays ordered from most to least
    similar, where ``indices`` are row positions in ``X`` and ``scores`` are
    the matching cosine similarities.

  Raises:
    ValueError: If ``top_n`` is negative.
  """
  if top_n < 0:
    raise ValueError(f"top_n must be non-negative, got {top_n}")

  # Transform the query into the same TF-IDF space as the documents.
  query_vec = vectorizer.transform([query])
  # cosine_similarity returns a (1, n_documents) matrix; keep the single row.
  scores = cosine_similarity(query_vec, X)[0]
  # Reverse first, then truncate: slicing with [-top_n:] would return *all*
  # documents when top_n == 0, because -0 == 0.
  top_indices = scores.argsort()[::-1][:top_n]
  return top_indices, scores[top_indices]


In [None]:
# Sample free-text queries used to exercise the retrieval function.
queries = [
    "computer graphics",
    "religion and spirituality",
    "medical research",
]


In [None]:
# Run each query and show its best-matching posts together with their scores.
for query in queries:
    print(f"Query: {query}")
    indices, scores = retrieve_top_documents(query, X, vectorizer)
    for idx, score in zip(indices, scores):
        snippet = data[idx][:200]  # preview only the first 200 characters
        print(f"Document: {snippet}...", f"Similarity Score: {score:.4f}", sep="\n")
    # Visual divider between the result sets of consecutive queries.
    print("\n" + "-" * 50 + "\n")
