In [1]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [2]:
import os
import re
import random
import math
from collections import defaultdict

def preprocess(text):
    return re.findall(r'\b\w+\b', text.lower())

def load_documents(folder_path):
    docs = {}
    for filename in os.listdir(folder_path):
        if filename.endswith('.txt'):
            with open(os.path.join(folder_path, filename), 'r') as file:
                docs[filename] = preprocess(file.read())
    return docs

def load_queries(query_file_path):
    with open(query_file_path, 'r') as file:
        return [line.strip() for line in file.readlines()]

In [3]:
def assign_random_relevance(queries, documents, relevance_scale=(0, 1)):
    relevance_scores = {}
    for query in queries:
        relevance_scores[query] = {}
        for doc in documents:
            relevance_scores[query][doc] = random.randint(relevance_scale[0], relevance_scale[1])
    return relevance_scores

In [4]:
def assign_random_relevance(queries, documents, relevance_scale=(0, 1)):
    relevance_scores = {}
    for query in queries:
        relevance_scores[query] = {}
        for doc in documents:
            relevance_scores[query][doc] = random.randint(relevance_scale[0], relevance_scale[1])
    return relevance_scores

def save_relevance_scores_to_file(relevance_scores, output_file):
    with open(output_file, 'w') as f:
        for query, doc_scores in relevance_scores.items():
            for doc, score in doc_scores.items():
                f.write(f"{query},{doc},{score}\n")


In [5]:
def load_relevance_scores(file_path):
    relevance_scores = defaultdict(lambda: defaultdict(int))
    with open(file_path, 'r') as f:
        for line in f:
            line = line.strip()
            if line:
                parts = line.split(',')  # Assuming comma as the delimiter
                if len(parts) >= 3:  # Check if there are at least 3 values
                    query, doc, score = parts[:3]  # Take the first 3 values
                    try:
                        relevance_scores[query][doc] = int(score)
                    except ValueError:
                        print(f"Warning: Could not convert score to integer for query '{query}', doc '{doc}', score '{score}'")
    return relevance_scores

In [6]:
def tokenize(text):
    # Simple tokenization by splitting on whitespace
    return text.lower().split()

In [7]:
def create_inverted_index(documents):
    inverted_index = defaultdict(set)
    for doc_id, content in documents.items():
        # content is a list of words, so join them back into a string
        content_str = ' '.join(content)
        for term in tokenize(content_str):
            inverted_index[term].add(doc_id)
    return inverted_index

In [8]:
def calculate_term_probabilities(relevance_scores, inverted_index, documents):
    term_probs = {}
    num_relevant = sum(1 for scores in relevance_scores.values() for score in scores.values() if score == 1)
    num_nonrelevant = sum(1 for scores in relevance_scores.values() for score in scores.values() if score == 0)

    # Smoothing parameter
    alpha = 0.01  # Reduced Laplace smoothing parameter

    for term, doc_ids in inverted_index.items():
        relevant_with_term = sum(1 for query, scores in relevance_scores.items() for doc in doc_ids if scores.get(doc, 0) == 1)
        nonrelevant_with_term = sum(1 for query, scores in relevance_scores.items() for doc in doc_ids if scores.get(doc, 0) == 0)

        # Apply Laplace smoothing
        p_t_r = (relevant_with_term + alpha) / (num_relevant + 2 * alpha)
        p_t_nr = (nonrelevant_with_term + alpha) / (num_nonrelevant + 2 * alpha)

        # Ensure probabilities are within valid range
        p_t_r = max(1e-10, min(1 - 1e-10, p_t_r))
        p_t_nr = max(1e-10, min(1 - 1e-10, p_t_nr))

        term_probs[term] = (p_t_r, p_t_nr)

    return term_probs

In [9]:
def binary_independence_model(query, documents, term_probs, inverted_index):
    scores = defaultdict(float)
    query_terms = set(tokenize(query))

    # Smoothing parameter
    alpha = 0.01  # Reduced Laplace smoothing parameter

    for doc_id, content in documents.items():
        doc_terms = set(content)  # content is already a list of words

        for term in query_terms:
            if term in term_probs:
                p_t_r, p_t_nr = term_probs[term]

                # Apply Laplace smoothing
                p_t_r = (p_t_r * len(documents) + alpha) / (len(documents) + 2 * alpha)
                p_t_nr = (p_t_nr * len(documents) + alpha) / (len(documents) + 2 * alpha)

                # Ensure probabilities are within valid range
                p_t_r = max(1e-10, min(1 - 1e-10, p_t_r))
                p_t_nr = max(1e-10, min(1 - 1e-10, p_t_nr))

                if term in doc_terms:
                    scores[doc_id] += math.log((p_t_r * (1 - p_t_nr)) / (p_t_nr * (1 - p_t_r)))
                else:
                    scores[doc_id] += math.log((1 - p_t_r) / (1 - p_t_nr))

    # Normalize scores
    if scores:
        min_score = min(scores.values())
        max_score = max(scores.values())
        for doc_id in scores:
            if max_score > min_score:
                scores[doc_id] = (scores[doc_id] - min_score) / (max_score - min_score)
            else:
                scores[doc_id] = 0

    return sorted(scores.items(), key=lambda x: x[1], reverse=True)

In [10]:
def save_model_results(results, output_file):
    with open(output_file, 'w') as f:
        for query, doc_scores in results.items():
            f.write(f"Query: {query}\n")
            for doc_id, score in doc_scores:
                f.write(f"{doc_id},{score}\n")
            f.write("\n")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Folder not found. Please check the path.


In [16]:
def main():
  # Updated paths based on your provided folder structure
  folder_path = '/content/drive/MyDrive/TECH 400 Information Retrieval/cars'
  query_file_path = '/content/drive/MyDrive/TECH 400 Information Retrieval/queries_car.txt'
  output_file = '/content/drive/MyDrive/TECH 400 Information Retrieval/result/query_relevance_score.txt'
  model_results_file = '/content/drive/MyDrive/TECH 400 Information Retrieval/result/bim_model_results.txt'

  # Load documents and queries
  documents = load_documents(folder_path)
  queries = load_queries(query_file_path)

  # Assign random relevance scores to documents
  random_relevance_scores = assign_random_relevance(queries, documents.keys())

  # Save the relevance scores to a file
  save_relevance_scores_to_file(random_relevance_scores, output_file)
  print(f"Relevance scores saved to {output_file}")

  # Load relevance scores
  relevance_scores = load_relevance_scores(output_file)

  # Create inverted index from the documents
  inverted_index = create_inverted_index(documents)

  # Calculate term probabilities using relevance scores and the inverted index
  term_probs = calculate_term_probabilities(relevance_scores, inverted_index, documents)

  # Perform retrieval using the Binary Independence Model (BIM) for each query
  for query in queries:
    print(f"Query: {query}")
    results = binary_independence_model(query, documents, term_probs, inverted_index)

    # Display the top 5 relevant documents for each query
    print("Top 5 relevant documents:")
    for doc_id, score in results[:5]:
        print(f"Document: {doc_id}, Score: {score}")
    print()

  # Save the model results to a .txt file
  model_results = {}
  for query in queries:
    results = binary_independence_model(query, documents, term_probs, inverted_index)
    model_results[query] = results

  # Save results to the result file
  save_model_results(model_results, model_results_file)
  print(f"Model results saved to {model_results_file}")

if __name__ == "__main__":
  main()


Relevance scores saved to /content/drive/MyDrive/TECH 400 Information Retrieval/result/query_relevance_score.txt
Query: electric cars under $40k
Top 5 relevant documents:
Document: Tesla Model S Plaid.txt, Score: 1.0
Document: Ford F-150 Lightning.txt, Score: 1.0
Document: Toyota RAV4 Hybrid.txt, Score: 1.0
Document: BMW iX.txt, Score: 1.0
Document: Mercedes-Benz EQS.txt, Score: 1.0

Query: turbocharged SUVs 2024
Top 5 relevant documents:
Document: Lexus RX 500h.txt, Score: 1.0
Document: Volvo XC90 Recharge.txt, Score: 1.0
Document: Subaru Outback Wilderness.txt, Score: 1.0
Document: Genesis GV80.txt, Score: 1.0
Document: Jeep Grand Cherokee 4xe.txt, Score: 1.0

Query: cars with adaptive cruise control
Top 5 relevant documents:
Document: Tesla Model S Plaid.txt, Score: 1.0
Document: Ford F-150 Lightning.txt, Score: 1.0
Document: BMW iX.txt, Score: 1.0
Document: Hyundai Ioniq 6.txt, Score: 1.0
Document: Kia EV6.txt, Score: 1.0

Query: top 10 hybrid sedans
Top 5 relevant documents:
Docum