In [230]:
# Install libraries if not already installed
#!pip install requests beautifulsoup4 nltk

# Import necessary libraries
import requests
from bs4 import BeautifulSoup
import json
import nltk
import re
import math
from collections import defaultdict

# Download NLTK stopwords and punkt tokenizer
#nltk.download("stopwords")
#nltk.download("punkt")

# Initialize stop words and stemmer
stop_words = set(nltk.corpus.stopwords.words("english"))
stemmer = nltk.PorterStemmer()

In [232]:
# Prompt the user for Wikipedia page titles
print("Enter Wikipedia page titles separated by commas (e.g., 'Artificial Intelligence, Machine Learning, Data Science'):")
page_titles_input = input().split(',')

# Clean up and prepare page titles list
page_titles = [title.strip() for title in page_titles_input]

# Function to Scrape Wikipedia Pages
def get_wikipedia_page_content(page_title):
    url = f"https://en.wikipedia.org/wiki/{page_title.replace(' ', '_')}"
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")
    paragraphs = soup.find_all("p")
    
    # Extract text content from paragraphs
    content = " ".join([para.text for para in paragraphs if para.text])
    return {"title": page_title, "content": content}

# Fetch documents for each title provided by the user
documents = [get_wikipedia_page_content(title) for title in page_titles]

# Save raw documents to JSON file
with open("wikipedia_data.json", "w") as f:
    json.dump(documents, f)

# Display fetched documents to verify content
#documents[:1]  # Show first document as an example



Enter Wikipedia page titles separated by commas (e.g., 'Artificial Intelligence, Machine Learning, Data Science'):


 New Jersey , Breaking Bad ,The Sopranos


In [233]:
# Preprocessing function
def preprocess_text(text):
    # Lowercase and remove special characters
    text = text.lower()
    text = re.sub(r"[^a-z0-9\s]", "", text)
    
    # Tokenize, remove stop words, and apply stemming
    tokens = nltk.word_tokenize(text)
    tokens = [stemmer.stem(word) for word in tokens if word not in stop_words]
    return tokens

# Apply preprocessing to each document and save to processed_data.json
processed_documents = [{"title": doc["title"], "content": preprocess_text(doc["content"])} for doc in documents]

with open("processed_data.json", "w") as f:
    json.dump(processed_documents, f)

# Display processed documents to verify
#processed_documents[:1]  # Show first processed document as an example


In [234]:
# Initialize an inverted index
inverted_index = defaultdict(dict)

# Populate the inverted index with term frequencies
for doc_id, doc in enumerate(processed_documents):
    for term in doc["content"]:
        if doc_id in inverted_index[term]:
            inverted_index[term][doc_id] += 1
        else:
            inverted_index[term][doc_id] = 1

# Save the inverted index to a JSON file (optional)
with open("inverted_index.json", "w") as f:
    json.dump(inverted_index, f)

In [235]:
# Parse the query with Boolean operators
def parse_query(query):
    terms = query.lower().split()
    tokens = []
    operators = {"and", "or", "not"}
    
    for term in terms:
        if term in operators:
            tokens.append(term)
        else:
            processed_term = stemmer.stem(term) if term not in stop_words else ""
            if processed_term:
                tokens.append(processed_term)
    
    return tokens

# Boolean retrieval function
def boolean_retrieval(query_tokens):
    result_set = set(range(len(processed_documents)))  # Start with all documents
    current_operation = "and"
    
    for token in query_tokens:
        if token in {"and", "or", "not"}:
            current_operation = token
        else:
            matching_docs = set(inverted_index.get(token, {}).keys())
            if current_operation == "and":
                result_set &= matching_docs
            elif current_operation == "or":
                result_set |= matching_docs
            elif current_operation == "not":
                result_set -= matching_docs
    
    return list(result_set)

In [237]:
# Compute TF-IDF weight for a term in a document
def compute_tf_idf(term, doc_id):
    term_frequency = inverted_index[term].get(doc_id, 0)
    if term_frequency == 0:
        return 0
    document_frequency = len(inverted_index[term])
    inverse_document_frequency = math.log(len(processed_documents) / (1 + document_frequency))
    return term_frequency * inverse_document_frequency

# Rank results by TF-IDF scores
def rank_results_tf_idf(query_tokens, result_docs):
    doc_scores = {}
    for doc_id in result_docs:
        score = 0
        for term in query_tokens:
            if term not in {"and", "or", "not"}:
                score += compute_tf_idf(term, doc_id)
        doc_scores[doc_id] = score
    return sorted(doc_scores.items(), key=lambda x: x[1], reverse=True)


In [238]:
# Convert document to TF-IDF vector
def document_to_vector(doc_id):
    doc_vector = {}
    for term in processed_documents[doc_id]["content"]:
        doc_vector[term] = compute_tf_idf(term, doc_id)
    return doc_vector

# Convert query to TF-IDF vector
def query_to_vector(query_tokens):
    query_vector = {}
    for term in query_tokens:
        if term not in {"and", "or", "not"}:
            term_frequency = query_tokens.count(term)
            document_frequency = len(inverted_index.get(term, []))
            inverse_document_frequency = math.log(len(processed_documents) / (1 + document_frequency))
            query_vector[term] = (term_frequency / len(query_tokens)) * inverse_document_frequency
    return query_vector

# Compute cosine similarity between two vectors
def cosine_similarity(query_vector, doc_vector):
    dot_product = sum(query_vector[term] * doc_vector.get(term, 0) for term in query_vector)
    query_norm = math.sqrt(sum(val ** 2 for val in query_vector.values()))
    doc_norm = math.sqrt(sum(val ** 2 for val in doc_vector.values()))
    if query_norm == 0 or doc_norm == 0:
        return 0
    return dot_product / (query_norm * doc_norm)

# VSM retrieval function
def vector_space_model(query_tokens):
    query_vector = query_to_vector(query_tokens)
    doc_scores = {}
    for doc_id in range(len(processed_documents)):
        doc_vector = document_to_vector(doc_id)
        similarity = cosine_similarity(query_vector, doc_vector)
        if similarity > 0:
            doc_scores[doc_id] = similarity
    return sorted(doc_scores.items(), key=lambda x: x[1], reverse=True)


In [239]:
# Main search function with algorithm selection
def search_documents(query, algorithm="tf-idf"):
    query_tokens = parse_query(query)
    
    if algorithm == "boolean":
        results = boolean_retrieval(query_tokens)
        print("Boolean Retrieval Results:")
        for doc_id in results:
            print(f"Document: {documents[doc_id]['title']}")
        return results
    elif algorithm == "tf-idf":
        result_docs = boolean_retrieval(query_tokens)
        ranked_results = rank_results_tf_idf(query_tokens, result_docs)
        print("\nTF-IDF Ranked Results (Document ID and Score):")
        for doc_id, score in ranked_results:
            print(f"Document: {documents[doc_id]['title']} - Score: {score:.4f}")
        return ranked_results
    elif algorithm == "vsm":
        ranked_results = vector_space_model(query_tokens)
        print("\nVector Space Model Ranked Results (Document ID and Cosine Similarity):")
        for doc_id, similarity in ranked_results:
            print(f"Document: {documents[doc_id]['title']} - Cosine Similarity: {similarity:.4f}")
        return ranked_results
    else:
        print("Unsupported algorithm selected.")
        return []

In [254]:
# Prompt user for query and algorithm selection
print("Enter your search query:")
query = input()

print("Choose retrieval algorithm:\n1. Boolean Retrieval\n2. TF-IDF Ranking\n3. Vector Space Model")
choice = input("Enter choice (1, 2, or 3): ")

if choice == "1":
    search_documents(query, algorithm="boolean")
elif choice == "2":
    search_documents(query, algorithm="tf-idf")
elif choice == "3":
    search_documents(query, algorithm="vsm")
else:
    print("Invalid choice. Please enter 1, 2, or 3.")

Enter your search query:


 New Mexico OR New Jersey


Choose retrieval algorithm:
1. Boolean Retrieval
2. TF-IDF Ranking
3. Vector Space Model


Enter choice (1, 2, or 3):  2



TF-IDF Ranked Results (Document ID and Score):
Document: Breaking Bad - Score: -5.7536
Document: The Sopranos - Score: -30.4943
Document: New Jersey - Score: -193.3224
