In [None]:
# Demo query, run through all three retrieval models.
query = "artificial intelligence data"
print("Query:", query)

# Boolean retrieval yields (index, document) pairs for exact term matches.
print("\nBoolean Retrieval:")
for idx, doc in boolean_search(query, documents):
    print(f"Doc {idx}: {doc}")

# The two scoring models share one output format: (index, score) pairs,
# so a single loop prints both rankings under their own headings.
for heading, ranker in (
    ("\nTF-IDF Cosine Similarity:", tfidf_cosine_similarity),
    ("\nKL Divergence Similarity:", kl_divergence_similarity),
):
    print(heading)
    for idx, score in ranker(query, documents):
        print(f"Doc {idx}: {documents[idx]} | Score = {score:.4f}")


## Run Models
Try a query and see results from all three models.


In [None]:
def kl_divergence_similarity(query, docs):
    """Rank documents by a KL-divergence-based similarity to the query.

    Every text is converted to a Laplace-smoothed unigram distribution over
    the shared vocabulary of all documents plus the query. The divergence
    KL(query || document) is then mapped to a similarity with 1 / (1 + KL):
    identical distributions score 1.0 and the score shrinks towards 0 as the
    divergence grows.

    Args:
        query: Query string.
        docs: Iterable of document strings.

    Returns:
        List of (document index, similarity) tuples, sorted by descending
        similarity.
    """
    def tokenize(text):
        # Same \w+ tokenisation that boolean_search applies to documents, so
        # punctuation does not create distinct terms ("math," vs "math").
        return re.findall(r"\w+", text.lower())

    docs = list(docs)
    # Sorted so the vector layout (and therefore float summation order) is
    # reproducible; iteration order of a set of strings can vary between runs.
    vocab = sorted({tok for text in docs + [query] for tok in tokenize(text)})

    def text_distribution(text):
        counts = Counter(tokenize(text))
        dist = np.array([counts[w] for w in vocab], dtype=float)
        # Laplace (add-one) smoothing keeps every probability > 0, so the
        # KL divergence below is always finite.
        return (dist + 1) / (dist.sum() + len(vocab))

    q_dist = text_distribution(query)
    results = []
    for idx, doc in enumerate(docs):
        # scipy.stats.entropy(pk, qk) computes KL(pk || qk).
        div = entropy(q_dist, text_distribution(doc))
        results.append((idx, 1 / (1 + div)))
    return sorted(results, key=lambda x: -x[1])


## KL Divergence Similarity
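Compute similarity from the KL divergence between the query and each document. Both texts are converted to Laplace-smoothed word distributions over a shared vocabulary, and the divergence is mapped to a score with $\text{sim} = \frac{1}{1 + D_{\mathrm{KL}}(Q \parallel D)}$, so a higher score means a closer match.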


In [None]:
def tfidf_cosine_similarity(query, docs):
    """Rank documents by TF-IDF cosine similarity to the query.

    The vectorizer is fitted on the documents together with the query, so the
    query's terms are part of the vocabulary and of the IDF statistics.

    Args:
        query: Query string.
        docs: Iterable of document strings.

    Returns:
        List of (document index, similarity) tuples, sorted by descending
        similarity. A document (or query) whose TF-IDF vector is all zeros
        gets similarity 0.0.
    """
    docs = list(docs)
    vectorizer = TfidfVectorizer()
    tfidf = vectorizer.fit_transform(docs + [query])

    # Densify the query row once instead of on every loop iteration.
    query_vec = tfidf[-1].toarray().ravel()
    query_is_zero = not query_vec.any()

    scores = []
    for i in range(len(docs)):
        doc_vec = tfidf[i].toarray().ravel()
        if query_is_zero or not doc_vec.any():
            # Cosine distance is undefined (0/0) for an all-zero vector and
            # would surface as NaN, which also makes the sort below
            # meaningless; treat "no shared evidence" as zero similarity.
            sim = 0.0
        else:
            sim = 1 - cosine(query_vec, doc_vec)
        scores.append((i, sim))
    return sorted(scores, key=lambda x: -x[1])


## TF-IDF Cosine Similarity
Compute similarity between query and documents using TF-IDF vectors and cosine similarity.


In [None]:
def boolean_search(query, docs):
    """Return the documents that contain every term of the query (Boolean AND).

    Query and documents are lower-cased and tokenised with the same ``\\w+``
    pattern, so punctuation attached to a query word (e.g. ``"math,"``) does
    not prevent a match.

    Args:
        query: Query string.
        docs: Iterable of document strings.

    Returns:
        List of (document index, document) tuples in input order. A query
        that contains no word tokens matches every document, because the
        empty term set is a subset of any document's terms.
    """
    query_terms = set(re.findall(r"\w+", query.lower()))
    return [
        (idx, doc)
        for idx, doc in enumerate(docs)
        if query_terms.issubset(set(re.findall(r"\w+", doc.lower())))
    ]


## Boolean Retrieval
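This function retrieves documents in which **all query terms are present** (a logical AND over the query terms). Matching is case-insensitive, and each hit is returned as an `(index, document)` pair.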


In [None]:
import numpy as np
import re
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.spatial.distance import cosine
from scipy.stats import entropy

# Document collection for IR: a small in-memory corpus of one-sentence
# documents. A document's position in this list is its ID -- the retrieval
# functions report enumerate() indices, and the demo cell looks texts up
# with documents[idx].
documents = [
    "Cats are small domesticated carnivorous mammals",
    "Artificial intelligence helps computers learn from data",
    "Natural language processing enables machines to understand text",
    "Search engines retrieve relevant web pages for queries",
    "Text analysis extracts insights from written content",
    "Neural networks are the backbone of deep learning models",
    "Data analytics combines math, programming, and domain knowledge"
]


## Imports & Document Collection
This cell imports necessary libraries and defines the document collection for information retrieval experiments.


# Lab2: Basic Information Retrieval Models
This notebook demonstrates:
1. Boolean Retrieval
2. TF-IDF Cosine Similarity
3. KL Divergence Similarity
