In [None]:
import os
import re
import numpy as np
from collections import defaultdict
from math import log


In [None]:
# Preprocessing function
def preprocess(text):
    """Lower-case ``text`` and split it into word tokens.

    A token is a maximal run of regex word characters bounded by word
    boundaries; everything else (punctuation, whitespace) is discarded.

    Args:
        text: Raw string to tokenize.

    Returns:
        list[str]: The lower-cased tokens in order of appearance.
    """
    word_pattern = re.compile(r'\b\w+\b')
    lowered = text.lower()
    return word_pattern.findall(lowered)


In [None]:

# Load documents
def load_documents(folder_path):
    """Read and tokenize every ``.txt`` file directly inside ``folder_path``.

    Args:
        folder_path: Directory to scan. Sub-directories are not descended into.

    Returns:
        dict[str, list[str]]: Maps each file name to the token list that
        ``preprocess`` produces for the file's contents. Keys are inserted in
        sorted file-name order.

    Raises:
        OSError: If the directory cannot be listed or a file cannot be opened.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    docs = {}
    # os.listdir returns entries in arbitrary order. Sorting keeps the dict's
    # insertion order stable across runs and machines, which matters to any
    # consumer that iterates the corpus in order (e.g. tie-breaking in a
    # stable sort of scores).
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.txt'):
            continue
        # Explicit encoding: the default would otherwise depend on the locale.
        with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as file:
            docs[filename] = preprocess(file.read())
    return docs


In [None]:
# Load queries
def load_queries(query_file_path):
    """Read one query per line from ``query_file_path``.

    Leading and trailing whitespace is stripped from every line. Lines that
    are empty after stripping are skipped, so stray blank lines in the file do
    not turn into empty queries.

    Args:
        query_file_path: Path to a UTF-8 text file with one query per line.

    Returns:
        list[str]: The non-empty queries in file order.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Explicit encoding: the default would otherwise depend on the locale.
    with open(query_file_path, 'r', encoding='utf-8') as file:
        stripped_lines = (line.strip() for line in file)
        return [query for query in stripped_lines if query]


In [None]:
# Compute term frequencies and document frequencies
def compute_statistics(docs):
    """Build per-document term counts and corpus-wide document frequencies.

    Args:
        docs: Mapping of document id to its list of tokens.

    Returns:
        tuple: ``(term_freq, term_doc_freq, doc_count)`` where

        * ``term_freq`` is a nested ``defaultdict``: document id -> token ->
          number of occurrences in that document. A document with no tokens
          gets no entry.
        * ``term_doc_freq`` is a ``defaultdict``: token -> number of documents
          containing it at least once.
        * ``doc_count`` is ``len(docs)`` (documents without tokens included).
    """
    per_doc_counts = defaultdict(lambda: defaultdict(int))
    docs_containing = defaultdict(int)

    for name, tokens in docs.items():
        if not tokens:
            # Nothing to count; leave the document out of per_doc_counts.
            continue
        counts = per_doc_counts[name]
        for token in tokens:
            counts[token] += 1
        # The keys of `counts` are exactly the distinct tokens of this document.
        for token in counts:
            docs_containing[token] += 1

    return per_doc_counts, docs_containing, len(docs)

In [None]:
# Compute relevance probabilities using BIM
def compute_relevance_prob(query, term_freq, term_doc_freq, doc_count):
    """Score every document in ``term_freq`` against ``query``.

    Each document starts at 1.0 and, for every query term (repeated terms
    count once per occurrence), is multiplied by

        ((tf + 1) / (doc_len + V)) / ((df + 1) / (doc_count - df + V))

    where ``tf`` is the term's count in the document, ``doc_len`` the total
    token count of the document, ``df`` the number of documents containing
    the term and ``V = len(term_doc_freq)``.

    NOTE(review): this add-one-smoothed ratio is what the code implements; it
    does not look like the textbook Binary Independence Model weight. Confirm
    it is the intended formulation.

    Args:
        query: Sequence of query tokens.
        term_freq: Mapping document id -> mapping token -> count.
        term_doc_freq: Mapping token -> document frequency.
        doc_count: Total number of documents in the corpus.

    Returns:
        dict: Document id -> float score, one entry per key of ``term_freq``.
        An empty query yields 1.0 for every document.
    """
    if not term_freq:
        # No documents to score; also avoids evaluating the per-term ratio on
        # an empty corpus, where its denominator could be zero.
        return {}

    vocab_size = len(term_doc_freq)

    # The "not relevant" factor depends only on the term, not on the document,
    # so compute it once per query term instead of once per (document, term).
    term_factors = []
    for term in query:
        df = term_doc_freq.get(term, 0)
        p_term_given_not_relevant = (df + 1) / (doc_count - df + vocab_size)
        term_factors.append((term, p_term_given_not_relevant))

    scores = {}
    for doc_id, doc_term_counts in term_freq.items():
        # Document length is invariant across query terms; sum it once.
        doc_length = sum(doc_term_counts.values())
        score = 1.0
        for term, p_term_given_not_relevant in term_factors:
            tf = doc_term_counts.get(term, 0)
            p_term_given_relevant = (tf + 1) / (doc_length + vocab_size)
            score *= (p_term_given_relevant / p_term_given_not_relevant)
        scores[doc_id] = score
    return scores

In [None]:
import os

# Main retrieval function
def retrieve_documents(folder_path, query_file_path, top_k=3, result_filename='Prahar_result.txt'):
    """Rank the documents in ``folder_path`` for every query and report the best.

    For each query the top ``top_k`` documents are printed to the console and
    the same lines are written to ``result_filename`` inside ``folder_path``.

    Args:
        folder_path: Directory holding the ``.txt`` documents; the results
            file is also written here.
        query_file_path: Text file with one query per line.
        top_k: Number of ranked documents reported per query (default 3).
            ``None`` reports all of them.
        result_filename: Name of the results file created in ``folder_path``.

    Returns:
        None
    """
    docs = load_documents(folder_path)
    # The results file lives in folder_path and ends in '.txt', so a previous
    # run's output would be loaded as a corpus document. Exclude it so re-runs
    # rank the same corpus as the first run.
    docs.pop(result_filename, None)
    queries = load_queries(query_file_path)

    term_freq, term_doc_freq, doc_count = compute_statistics(docs)

    # The results file is written next to the documents.
    output_file_path = os.path.join(folder_path, result_filename)

    with open(output_file_path, 'w', encoding='utf-8') as result_file:
        for query in queries:
            query_terms = preprocess(query)
            scores = compute_relevance_prob(query_terms, term_freq, term_doc_freq, doc_count)
            ranked_docs = sorted(scores.items(), key=lambda item: item[1], reverse=True)

            # Build the report once so console and file cannot drift apart.
            report_lines = [f"Query: {query}"]
            report_lines.extend(
                f"Rank {rank}: Document {doc_id}, Score: {score:.4f}"
                for rank, (doc_id, score) in enumerate(ranked_docs[:top_k], start=1)
            )
            report_lines.append("")  # blank separator line after each query

            for line in report_lines:
                print(line)
                result_file.write(line + "\n")

# Example usage
# Both paths are absolute and live under /content/drive/MyDrive — presumably a
# Google Colab session with Google Drive mounted (TODO confirm); adjust them
# when running anywhere else.
folder_path = '/content/drive/MyDrive/Information Retrieval System/Week4/docs'
query_file_path = '/content/drive/MyDrive/Information Retrieval System/Week4/queries.txt'
# Prints the top-ranked documents for each query and writes the same report to
# 'Prahar_result.txt' inside folder_path.
retrieve_documents(folder_path, query_file_path)


Query: iPhone battery drain
Rank 1: Document Docs1.txt, Score: 0.3582
Rank 2: Document Docs18.txt, Score: 0.2107
Rank 3: Document Docs17.txt, Score: 0.1050

Query: Spotify Bluetooth issue
Rank 1: Document Docs18.txt, Score: 0.0618
Rank 2: Document Docs10.txt, Score: 0.0388
Rank 3: Document Docs2.txt, Score: 0.0354

Query: Train delay update
Rank 1: Document Docs5.txt, Score: 0.1876
Rank 2: Document Docs19.txt, Score: 0.1783
Rank 3: Document Docs7.txt, Score: 0.1479

Query: O2 telemarketing calls
Rank 1: Document Docs17.txt, Score: 0.8420
Rank 2: Document Docs18.txt, Score: 0.4226
Rank 3: Document Docs19.txt, Score: 0.4194

Query: iOS downgrade
Rank 1: Document Docs1.txt, Score: 1.5297
Rank 2: Document Docs18.txt, Score: 0.4468
Rank 3: Document Docs17.txt, Score: 0.4457

Query: British Airways delay
Rank 1: Document Docs2.txt, Score: 0.6457
Rank 2: Document Docs18.txt, Score: 0.4226
Rank 3: Document Docs17.txt, Score: 0.4210

Query: Tesco website down
Rank 1: Document Docs18.txt, Score: