In [1]:
import os
import glob
import math
import nltk
import pandas as pd
from xml.etree import ElementTree as ET
from collections import defaultdict
import re
import string

In [2]:
def load_stop_words(file_path="common-english-words.txt"):
    """Read a comma-separated stop-word list and return it as a set.

    The file content is split on commas; each piece is whitespace-trimmed
    and empty pieces are discarded.

    Args:
        file_path: Path of the UTF-8 text file holding the stop words.

    Returns:
        A set of stop-word strings. An empty set is returned (after printing
        a message) when the file is missing or any other error occurs.
    """
    loaded = set()
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            raw_text = handle.read()
        for piece in raw_text.strip().split(','):
            word = piece.strip()
            if word:  # ignore blanks produced by ",," or a trailing comma
                loaded.add(word)
    except FileNotFoundError:
        print(f"Error: Stop words file '{file_path}' not found. Returning empty set.")
        return set()
    except Exception as e:
        print(f"Error loading stop words from {file_path}: {e}")
        return set()
    return loaded

In [3]:
def normalize_token(token_text):
    """Lowercase a token and trim punctuation from both of its ends.

    Only leading and trailing characters contained in ``string.punctuation``
    are removed; punctuation inside the token is kept. Examples:
    "(Word)" -> "word", "word." -> "word", "U.S." -> "u.s", "don't" -> "don't".

    Args:
        token_text: The raw token string (may be empty or None).

    Returns:
        The cleaned token, or None when the input is empty/None or nothing
        remains once the surrounding punctuation is removed.
    """
    if token_text:
        trimmed = token_text.lower().strip(string.punctuation)
        if trimmed:
            return trimmed
    return None

In [4]:
def parse_queries(query_file):
    """Extract query numbers and titles from a query definition file.

    The file content is cut into blocks at every "<Query>" marker. Within a
    block the number is taken from a "<num> Number: R<digits>" entry and the
    title is the text following "<title>" up to the next newline or "<".
    Blocks where either piece is missing are ignored.

    Args:
        query_file: Path of the UTF-8 query file.

    Returns:
        A dict mapping query number (e.g. "R101") to its stripped title.
        An empty dict is returned (after printing a message) when the file
        is missing or any other error occurs.
    """
    num_pattern = re.compile(r'<num>\s*Number:\s*(R\d+)')
    title_pattern = re.compile(r'<title>\s*([^\n<]+)')

    try:
        with open(query_file, 'r', encoding='utf-8') as handle:
            raw = handle.read()

        parsed = {}
        for chunk in raw.split('<Query>'):
            # Covers the empty piece before the first marker as well.
            if not chunk.strip():
                continue
            found_num = num_pattern.search(chunk)
            found_title = title_pattern.search(chunk)
            if found_num is None or found_title is None:
                continue
            parsed[found_num.group(1)] = found_title.group(1).strip()
    except FileNotFoundError:
        print(f"Error: Query file '{query_file}' not found.")
        return {}
    except Exception as e:
        print(f"Error parsing {query_file}: {e}")
        return {}
    return parsed

In [5]:
def parse_dataset(folder, stop_words):
    """Parse the ``*.xml`` documents in ``folder`` into per-document term counts.

    For every XML file the namespace prefix is stripped from element tags,
    the ``newsitem`` element is located (the root itself or a descendant),
    and the direct text of each ``text/p`` paragraph is joined, tokenized
    with ``nltk.word_tokenize``, normalized with ``normalize_token`` and
    filtered against ``stop_words``.

    Files without a ``newsitem`` element, without an ``itemid`` attribute or
    without any ``text/p`` paragraphs are skipped silently. XML parse errors
    and any other exception are reported with ``print`` and the file is
    skipped.

    Args:
        folder: Directory that holds the XML documents.
        stop_words: Collection of terms to exclude (membership-tested).

    Returns:
        A dict mapping each document's ``itemid`` to a mapping of
        term -> occurrence count. Documents that yield no terms do not
        appear in the result.
    """
    docs = defaultdict(lambda: defaultdict(int))

    # glob.glob gives no ordering guarantee (it follows the file system).
    # The result's insertion order is what breaks score ties downstream in a
    # stable sort, so process the files in sorted order to keep rankings
    # reproducible across platforms.
    for file_path in sorted(glob.glob(os.path.join(folder, "*.xml"))):
        try:
            root = ET.parse(file_path).getroot()

            # Drop "{namespace}" prefixes so plain tag names can be searched.
            for elem in root.iter():
                if '}' in elem.tag:
                    elem.tag = elem.tag.split('}', 1)[1]

            newsitem = root if root.tag == 'newsitem' else root.find(".//newsitem")
            if newsitem is None:
                continue

            doc_id = newsitem.get("itemid")
            if doc_id is None:
                continue

            text_elements = newsitem.findall(".//text/p")
            if not text_elements:
                continue

            text_content = " ".join(e.text for e in text_elements if e.text)

            # Normalize first, then drop empties and stop words.
            normalized_terms = (
                normalize_token(token) for token in nltk.word_tokenize(text_content)
            )
            final_doc_terms = [
                term for term in normalized_terms if term and term not in stop_words
            ]

            for term in final_doc_terms:
                docs[doc_id][term] += 1
        except ET.ParseError:
            print(f"XML Parse Error in {file_path}. Skipping this file.")
        except Exception as e:
            print(f"Error parsing document {file_path}: {e}")

    # Hand back a plain dict so lookups of unknown ids do not create entries.
    return dict(docs)

In [6]:
def bm25_score(query_terms, docs, k1=1.2, k2=500, b=0.75):
    """Score every document in ``docs`` against a query with BM25.

    For each query term i that occurs in document D the contribution is

        log2((N - n_i + 0.5) / (n_i + 0.5) + 1)
            * ((k1 + 1) * f_i) / (K + f_i)
            * ((k2 + 1) * qf_i) / (k2 + qf_i)

    where N is the number of documents, n_i the number of documents whose
    term mapping contains i, f_i the count of i in D, qf_i the count of i in
    the query, and K = k1 * ((1 - b) + b * dl / avdl) with dl the sum of the
    document's term counts and avdl the mean dl (1.0 is used when the mean
    is 0).

    Args:
        query_terms: List of (already normalized) query terms; repeats count
            towards qf_i.
        docs: Mapping of doc_id -> mapping of term -> count.
        k1: Document term-frequency saturation parameter.
        k2: Query term-frequency saturation parameter.
        b: Length-normalization parameter.

    Returns:
        A list of ``(doc_id, score)`` tuples sorted by descending score.
        Documents with equal scores keep the iteration order of ``docs``.
        An empty list is returned when ``docs`` is empty.
    """
    N = len(docs)
    if N == 0:
        return []

    dl = {doc_id: sum(freqs.values()) for doc_id, freqs in docs.items()}
    avdl = sum(dl.values()) / N
    if avdl == 0:
        avdl = 1.0  # all documents empty; avoid dividing by zero below

    # qf_i per distinct query term. A dict keeps first-occurrence order, so
    # the floating-point summation order below is the same on every run
    # (iterating a set of strings is not ordered consistently across runs).
    query_term_counts = defaultdict(int)
    for term in query_terms:
        query_term_counts[term] += 1

    # n_i is only needed for the query terms, not for the whole vocabulary.
    doc_freq = dict.fromkeys(query_term_counts, 0)
    for doc_content in docs.values():
        for term in doc_freq:
            if term in doc_content:
                doc_freq[term] += 1

    # The IDF and query-side weight do not depend on the document, so they
    # are computed once per term instead of once per (document, term) pair.
    term_weights = {}
    for term, qf_i in query_term_counts.items():
        n_i = doc_freq[term]
        if n_i == 0:
            continue  # term appears in no document; it cannot contribute
        # n_i <= N, so the ratio is positive and the log argument exceeds 1.
        IDF_i = math.log2((N - n_i + 0.5) / (n_i + 0.5) + 1)
        query_term_weight = ((k2 + 1) * qf_i) / (k2 + qf_i)
        term_weights[term] = (IDF_i, query_term_weight)

    scores = {}
    for doc_id, doc_content in docs.items():
        K = k1 * ((1 - b) + b * (dl[doc_id] / avdl))
        score = 0.0
        for term, (IDF_i, query_term_weight) in term_weights.items():
            f_i = doc_content.get(term, 0)
            if f_i == 0:
                # Contributes nothing; skipping also avoids 0 / 0 when K is 0
                # (e.g. b == 1 with an empty document, or k1 == 0).
                continue
            doc_term_weight = ((k1 + 1) * f_i) / (K + f_i)
            score += IDF_i * doc_term_weight * query_term_weight
        scores[doc_id] = score

    # sorted() is stable, so ties keep the iteration order of ``docs``.
    return sorted(scores.items(), key=lambda x: x[1], reverse=True)

In [7]:
def run_bm25ir(query_file, dataset_base_path, output_folder, stop_words, top12_excel):
    """Rank every dataset with BM25 and export the top-12 documents per query.

    For each query id R101..R150 the matching ``Dataset<number>`` folder under
    ``dataset_base_path`` is parsed, the query title is tokenized, normalized
    and stop-word filtered, and all documents are scored with ``bm25_score``.
    The full ranking is written to
    ``<output_folder>/BM25IR_<query>Ranking.dat`` (one "doc_id score" line
    per document). The top 12 rows of every query are collected into one
    table; queries that cannot be ranked (missing query, missing folder, no
    parsed documents, no query terms) or that yield fewer than 12 documents
    are padded with placeholder rows scored 0.0.

    Args:
        query_file: Path of the query definition file.
        dataset_base_path: Directory containing the ``Dataset<number>`` folders.
        output_folder: Directory for the ranking files (created if missing).
        stop_words: Collection of terms to exclude from queries and documents.
        top12_excel: Path of the Excel workbook to write. If writing it fails,
            a CSV named ``<path without extension>_fallback.csv`` is written
            instead.

    Returns:
        None. Results are written to disk and progress is printed.
    """
    os.makedirs(output_folder, exist_ok=True)

    all_top12_data = []
    queries = parse_queries(query_file)

    print(f"Total queries parsed: {len(queries)}")
    # Built in ascending order, so the output is already sorted by query id.
    expected_query_ids = [f"R{num}" for num in range(101, 151)]

    for query_num in expected_query_ids:
        if query_num not in queries:
            # Report the file that was actually read, not a fixed name.
            print(f"Query {query_num} not found in {query_file}. Skipping.")
            all_top12_data.extend(_bm25ir_placeholder_rows(query_num, 'N/A - Query Missing'))
            continue

        current_query_title = queries[query_num]
        dataset_folder = os.path.join(dataset_base_path, f"Dataset{query_num[1:]}")

        if not os.path.exists(dataset_folder):
            print(f"Dataset folder {dataset_folder} not found for Query {query_num}. Skipping.")
            all_top12_data.extend(_bm25ir_placeholder_rows(query_num, 'N/A - Dataset Missing'))
            continue

        print(f"\nProcessing Query {query_num} (Title: '{current_query_title}')")
        print(f"Dataset {dataset_folder}: Found {len(glob.glob(os.path.join(dataset_folder, '*.xml')))} XML files")

        docs = parse_dataset(dataset_folder, stop_words)
        if not docs:
            print(f"No documents parsed for {dataset_folder} (Query {query_num}). Skipping.")
            all_top12_data.extend(_bm25ir_placeholder_rows(query_num, 'N/A - No Docs Parsed'))
            continue
        print(f"Dataset {dataset_folder}: Parsed {len(docs)} documents.")

        # Query terms go through the same normalization as document terms.
        normalized_query_tokens = (
            normalize_token(token) for token in nltk.word_tokenize(current_query_title)
        )
        query_terms = [
            term for term in normalized_query_tokens if term and term not in stop_words
        ]

        print(f"Final query terms for {query_num}: {query_terms}")

        if not query_terms:
            print(f"Query {query_num} has no terms after processing. Skipping ranking.")
            all_top12_data.extend(_bm25ir_placeholder_rows(query_num, 'N/A - No Query Terms'))
            continue

        ranked_docs = bm25_score(query_terms, docs)
        output_file_path = os.path.join(output_folder, f"BM25IR_{query_num}Ranking.dat")

        with open(output_file_path, "w", encoding="utf-8") as f_out:
            for doc_id, score in ranked_docs:
                f_out.write(f"{doc_id} {score}\n")
        print(f"Saved rankings for {query_num} to {output_file_path}")

        print(f"Top-12 documents for Query_{query_num}:")
        for rank, (doc_id, score) in enumerate(ranked_docs[:12], start=1):
            print(f"  Rank {rank}: {doc_id} (Score: {score:.4f})")
            all_top12_data.append({
                'Query': query_num, 'Rank': rank, 'Document_ID': doc_id, 'BM25_Score': score
            })

        # Pad up to rank 12 when the dataset produced fewer results.
        all_top12_data.extend(
            _bm25ir_placeholder_rows(
                query_num, 'N/A - Fewer than 12 results', start_rank=len(ranked_docs) + 1
            )
        )

    df_top12 = pd.DataFrame(all_top12_data)
    # Fix the column order (also yields the columns when there are no rows).
    df_top12 = df_top12.reindex(columns=['Query', 'Rank', 'Document_ID', 'BM25_Score'])

    try:
        with pd.ExcelWriter(top12_excel, engine='xlsxwriter') as writer:
            df_top12.to_excel(writer, sheet_name='BM25IR Top 12', index=False)
        print(f"\nTop-12 documents for all queries saved to {top12_excel}")
    except Exception as e:
        print(f"Error writing to Excel file {top12_excel}: {e}")
        # Swap only the real extension; str.replace would leave the path
        # unchanged for a non-".xlsx" name and could hit a match mid-path.
        path_without_ext, _ = os.path.splitext(top12_excel)
        csv_fallback_path = f"{path_without_ext}_fallback.csv"
        try:
            df_top12.to_csv(csv_fallback_path, index=False)
            print(f"Saved top-12 results to CSV as fallback: {csv_fallback_path}")
        except Exception as e_csv:
            print(f"Error writing to CSV fallback {csv_fallback_path}: {e_csv}")


def _bm25ir_placeholder_rows(query_num, reason, start_rank=1, last_rank=12):
    """Return filler result rows for ranks ``start_rank``..``last_rank`` of a query."""
    return [
        {'Query': query_num, 'Rank': rank_idx, 'Document_ID': reason, 'BM25_Score': 0.0}
        for rank_idx in range(start_rank, last_rank + 1)
    ]

In [15]:
# --- Main Execution ---
if __name__ == '__main__':
    # Locations of the inputs and of the generated outputs.
    QUERY_FILE = "Queries-1.txt"
    DATASET_BASE_PATH = "DataSets-1"
    OUTPUT_FOLDER = "BM25RankingOutputs"
    STOP_WORDS_FILE = "common-english-words.txt"
    TOP12_EXCEL = "BM25IR_Top12_Results.xlsx"

    print("Loading stop words...")
    STOP_WORDS = load_stop_words(STOP_WORDS_FILE)
    if STOP_WORDS:
        print(f"Loaded {len(STOP_WORDS)} stop words.")
    else:
        # An empty set is still usable; nothing will be filtered out.
        print("Warning: No stop words loaded. Proceeding with an empty stop word list.")

    print("\nRunning BM25IR System...")
    run_bm25ir(
        query_file=QUERY_FILE,
        dataset_base_path=DATASET_BASE_PATH,
        output_folder=OUTPUT_FOLDER,
        stop_words=STOP_WORDS,
        top12_excel=TOP12_EXCEL,
    )
    print("\nBM25IR System run complete.")


Loading stop words...
Loaded 119 stop words.

Running BM25IR System...
Total queries parsed: 50

Processing Query R101 (Title: 'Economic espionage')
Dataset DataSets-1\Dataset101: Found 23 XML files
Dataset DataSets-1\Dataset101: Parsed 23 documents.
Final query terms for R101: ['economic', 'espionage']
Saved rankings for R101 to BM25RankingOutputs\BM25IR_R101Ranking.dat
Top-12 documents for Query_R101:
  Rank 1: 46547 (Score: 6.3310)
  Rank 2: 46974 (Score: 6.3310)
  Rank 3: 62325 (Score: 4.7737)
  Rank 4: 6146 (Score: 2.7870)
  Rank 5: 61329 (Score: 2.6003)
  Rank 6: 22170 (Score: 2.5127)
  Rank 7: 61780 (Score: 2.2597)
  Rank 8: 22513 (Score: 1.7784)
  Rank 9: 82330 (Score: 1.4353)
  Rank 10: 39496 (Score: 1.1719)
  Rank 11: 18586 (Score: 0.0000)
  Rank 12: 26642 (Score: 0.0000)

Processing Query R102 (Title: 'Convicts, repeat offenders')
Dataset DataSets-1\Dataset102: Found 199 XML files
Dataset DataSets-1\Dataset102: Parsed 199 documents.
Final query terms for R102: ['convicts', '