In [4]:
import os
import re
import numpy as np
from collections import defaultdict
from math import log

In [5]:
# Preprocessing
def clean_text(text):
    """Lower-case the text and split it into word tokens.

    Params:
      text...The raw string to tokenize.

    Returns:
      List of lower-cased tokens, each a maximal run of word characters
      (letters, digits, underscore) in order of appearance.
    """
    lowered = text.lower()
    return [match.group(0) for match in re.finditer(r'\b\w+\b', lowered)]

# Load text files into a dictionary
def fetch_documents(directory_path):
    """Read every ``.txt`` file in a directory and tokenize its contents.

    Params:
      directory_path...Path of the directory that holds the document files.

    Returns:
      Dict mapping each ``.txt`` file name to the token list returned by
      clean_text for that file's contents. Entries are inserted in sorted
      file-name order.
    """
    documents = {}
    # os.listdir returns names in arbitrary order; sorting makes the dict's
    # insertion order (and anything derived from it) reproducible across runs.
    for doc_file in sorted(os.listdir(directory_path)):
        if not doc_file.endswith('.txt'):
            continue
        file_path = os.path.join(directory_path, doc_file)
        # Explicit encoding so decoding does not depend on the platform locale.
        with open(file_path, 'r', encoding='utf-8') as f:
            documents[doc_file] = clean_text(f.read())
    return documents

# Load queries from a text file (one query per line)
def fetch_queries(query_path):
    """Load queries from a text file, one query per line.

    Params:
      query_path...Path of the text file that contains the queries.

    Returns:
      List of query strings with surrounding whitespace removed. Lines that
      are empty after stripping are skipped.
    """
    # Explicit encoding so decoding does not depend on the platform locale.
    with open(query_path, 'r', encoding='utf-8') as f:
        stripped_lines = (line.strip() for line in f)
        # A blank line (e.g. a trailing newline at the end of the file) would
        # otherwise be returned as an empty query.
        return [query for query in stripped_lines if query]


In [6]:
# Calculate term frequencies and document frequencies
def calculate_frequencies(docs):
    """Build per-document term counts and corpus-wide document frequencies.

    Params:
      docs...Dict mapping a document name to its list of tokens.

    Returns:
      Tuple (term_frequency, term_in_docs, total_docs) where
      term_frequency[doc][term] is the number of times term occurs in doc,
      term_in_docs[term] is the number of documents containing term, and
      total_docs is len(docs). A document with no tokens gets no entry in
      term_frequency.
    """
    per_doc_counts = defaultdict(lambda: defaultdict(int))
    docs_containing = defaultdict(int)

    for name, tokens in docs.items():
        for token in tokens:
            counts = per_doc_counts[name]
            counts[token] += 1
            # The first occurrence of a token inside this document is the
            # moment the document starts counting towards its document frequency.
            if counts[token] == 1:
                docs_containing[token] += 1

    return per_doc_counts, docs_containing, len(docs)

# Compute BIM-based relevance scores
def get_relevance_score(query_terms, term_frequency, term_in_docs, total_docs):
    """Score every document against a query as a product of smoothed ratios.

    For each query term the document's score is multiplied by
    prob_relevant / prob_non_relevant, where

      prob_relevant     = (tf + 1) / (doc_length + vocabulary_size)
      prob_non_relevant = (df + 1) / (total_docs - df + vocabulary_size)

    with tf the term's count in the document, df the number of documents
    containing the term, doc_length the total token count of the document
    and vocabulary_size the number of distinct terms in the corpus.

    NOTE(review): this uses within-document term counts and document length,
    so it is not the textbook Binary Independence Model term weight; confirm
    the formula is the intended one.

    Params:
      query_terms......List of query tokens (repeated tokens contribute once
                       per occurrence).
      term_frequency...Dict mapping document name to a dict of term counts.
      term_in_docs.....Dict mapping term to its document frequency.
      total_docs.......Number of documents in the corpus.

    Returns:
      Dict mapping each document name in term_frequency to its score (float).
      Every document scores 1.0 when query_terms is empty.
    """
    # Nothing to score; also avoids evaluating the ratios below on an empty corpus.
    if not term_frequency:
        return {}

    vocabulary_size = len(term_in_docs)

    # The non-relevant estimate depends only on the term, not on the document,
    # so compute it once per query term instead of once per (document, term).
    non_relevant_probs = []
    for term in query_terms:
        doc_freq = term_in_docs.get(term, 0)
        prob_non_relevant = (doc_freq + 1) / (total_docs - doc_freq + vocabulary_size)
        non_relevant_probs.append((term, prob_non_relevant))

    relevance_scores = {}
    for doc_name, doc_term_counts in term_frequency.items():
        # Document length is constant for all query terms; hoisted out of the
        # inner loop where it was previously re-summed for every term.
        relevant_denominator = sum(doc_term_counts.values()) + vocabulary_size
        probability_score = 1.0
        for term, prob_non_relevant in non_relevant_probs:
            term_freq = doc_term_counts.get(term, 0)
            prob_relevant = (term_freq + 1) / relevant_denominator
            probability_score *= (prob_relevant / prob_non_relevant)
        relevance_scores[doc_name] = probability_score
    return relevance_scores

In [7]:
# Core function to retrieve documents based on queries
def search_documents(directory_path, query_path):
    """Rank all documents for every query and print the rankings.

    Params:
      directory_path...Directory containing the ``.txt`` documents.
      query_path.......Text file containing the queries.

    Returns:
      None. For each query a "Query: ..." header is printed, followed by one
      "Document: ..., Score: ..." line per document in descending score
      order, then a blank line.
    """
    corpus = fetch_documents(directory_path)
    all_queries = fetch_queries(query_path)

    # Corpus statistics are query-independent, so they are built once up front.
    tf_table, df_table, corpus_size = calculate_frequencies(corpus)

    for query_text in all_queries:
        doc_scores = get_relevance_score(
            clean_text(query_text), tf_table, df_table, corpus_size
        )
        ranking = list(doc_scores.items())
        # Highest score first; the sort is stable, so ties keep their original order.
        ranking.sort(key=lambda pair: pair[1], reverse=True)

        print(f"Query: {query_text}")
        for doc_name, rel_score in ranking:
            print(f"Document: {doc_name}, Score: {rel_score:.4f}")
        print()

In [8]:
# Calculate Average Precision (AP) for a single query
def calculate_average_precision(relevant_docs, ranked_docs):
    """
    Calculate Average Precision (AP) for a single query.

    Params:
      relevant_docs...A set (or other iterable) of relevant document IDs for
                      the query.
      ranked_docs.....The ranking for the query, best first. Each entry is
                      either a bare document ID or a (doc_id, score) pair;
                      for pairs only the first element is used.

    Returns:
      Average Precision (float): the sum of precision values at the rank of
      each relevant document found, divided by the number of relevant
      documents. 0.0 when relevant_docs is empty.
    """
    if not relevant_docs:
        return 0.0

    relevant_docs = set(relevant_docs)

    precision_sum = 0.0
    num_hits = 0

    for rank, entry in enumerate(ranked_docs, start=1):
        # Accept both documented forms: a plain ID, or a (doc_id, score) pair.
        # Unconditional tuple-unpacking would fail on plain string IDs.
        doc_id = entry[0] if isinstance(entry, (tuple, list)) else entry
        if doc_id in relevant_docs:
            num_hits += 1
            precision_sum += num_hits / rank  # Precision at each relevant doc position

    return precision_sum / len(relevant_docs)

# Input locations: the directory of .txt documents and the query file.
# Presumably a Google Drive mount inside Colab (paths start with /content/drive).
# NOTE(review): the folder name begins with a space (' tech400 Dataset and Query');
# confirm this matches the real folder name and is not a typo.
doc_directory = '/content/drive/MyDrive/ tech400 Dataset and Query'
query_file = '/content/drive/MyDrive/query/final_query.txt'

# Rank every document for each query and print the rankings.
search_documents(doc_directory, query_file)

Query: How to make beef fry
Document: french_fries.txt, Score: 0.0158
Document: egg_fired_rice.txt, Score: 0.0065
Document: beef_fry.txt, Score: 0.0053
Document: chicken_chilli.txt, Score: 0.0043
Document: butter_chicken.txt, Score: 0.0017
Document: beef_burger.txt, Score: 0.0016
Document: egg_salad.txt, Score: 0.0016
Document: lemonade.txt, Score: 0.0013
Document: hummus.txt, Score: 0.0009
Document: veg_momo.txt, Score: 0.0005
Document: Mutton_gravy.txt, Score: 0.0005
Document: eggless_choco_lava.txt, Score: 0.0004
Document: virgin_mojito.txt, Score: 0.0003
Document: chicken_biryani.txt, Score: 0.0003
Document: cheese_cake.txt, Score: 0.0002

Query: How to make butter chicken
Document: butter_chicken.txt, Score: 1.2931
Document: chicken_chilli.txt, Score: 0.0830
Document: egg_fired_rice.txt, Score: 0.0038
Document: chicken_biryani.txt, Score: 0.0030
Document: french_fries.txt, Score: 0.0015
Document: eggless_choco_lava.txt, Score: 0.0008
Document: egg_salad.txt, Score: 0.0007
Document