In [16]:
!pip install scikit-learn




[notice] A new release of pip is available: 23.1.2 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [3]:
newsgroups = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))
data = newsgroups.data
target = newsgroups.target
target_names = newsgroups.target_names


In [4]:
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(data)

In [14]:
import re

def highlight_query_terms(query, document):
    # Escape special characters in the query
    query = re.escape(query)
    # Create a regular expression pattern to match the query terms
    pattern = r'(\b' + query + r'\b)'
    # Replace matched terms with highlighted versions
    highlighted_document = re.sub(pattern, r'<mark>\1</mark>', document, flags=re.IGNORECASE)
    return highlighted_document


In [9]:
def retrieve_top_documents(query, X, vectorizer, data, top_n=3):
    query_vec = vectorizer.transform([query])
    similarities = cosine_similarity(query_vec, X)
    top_indices = similarities[0].argsort()[-top_n:][::-1]
    top_documents = [(data[idx], similarities[0][idx]) for idx in top_indices]
    highlighted_documents = [(highlight_query_terms(query, doc), score) for doc, score in top_documents]
    return highlighted_documents


In [15]:
def format_output(query, highlighted_documents):
    print(f"Query: {query}\n")
    for i, (doc, score) in enumerate(highlighted_documents, 1):
        print(f"Document {i}:")
        print(f"Similarity Score: {score:.4f}")
        print(f"Content: {doc[:500]}...")  # Display the first 500 characters
        print("\n" + "-"*50 + "\n")


In [11]:
queries = [
    "computer graphics",
    "artificial intelligence",
    "quantum computing"
]


In [12]:
for query in queries:
    highlighted_docs = retrieve_top_documents(query, X, vectorizer, data)
    format_output(query, highlighted_docs)


Query: computer graphics

Document 1:
Similarity Score: 0.4859
Content: Technion - Israel Institute of Technology
         Department of Computer Science

       GRADUATE STUDIES IN <mark>COMPUTER GRAPHICS</mark>

Applications are invited for graduate students wishing
to specialize in <mark>computer graphics</mark> and related fields.
Active research is being conducted in the fields of
image rendering, geometric modelling and computer animation.
State of the art graphics workstations (Sun, Silicon Graphics)
and video equipment are available.
The Technion offers full...

--------------------------------------------------

Document 2:
Similarity Score: 0.4527
Content: EUROPEAN COMPUTER RESEARCH CENTRE

Research Positions in 3D Graphics

ECRC is currently expanding its research staff in three-dimensional
graphics. We are looking for highly qualified researchers with a PhD in
computer science and a proven ability to conduct highly innovative
research. Preference will be given to candidates