In [5]:
import json
import os
from collections import defaultdict

In [4]:
jsonl_file_path = "data/llm4eval_document_2024.jsonl"
# --- End Configuration ---

# Check if the file exists (optional but good practice)
if not os.path.exists(jsonl_file_path):
    print(f"Error: File not found at '{jsonl_file_path}'")
else:
    print(f"Found file: '{jsonl_file_path}'. Processing...")
    unique_docids = set()
    line_count = 0
    error_count = 0

    try:
        # Open the JSONL file for reading
        with open(jsonl_file_path, "r", encoding="utf-8") as f:
            for line in f:
                line_count += 1
                # Strip leading/trailing whitespace
                line = line.strip()
                # Skip empty lines
                if not line:
                    continue

                try:
                    # Parse the JSON string on the current line into a Python dict
                    data = json.loads(line)

                    # Extract the docid (use .get() for safety if key might be missing)
                    docid = data.get("docid")

                    # Add the docid to the set if it exists
                    # Sets automatically handle uniqueness
                    if docid is not None:
                        unique_docids.add(docid)
                    else:
                        # Optional: Warn if a line is missing 'docid'
                        print(
                            f"Warning: 'docid' key not found in line {line_count}: {line[:100]}..."
                        )  # Print first 100 chars
                        error_count += 1

                except json.JSONDecodeError:
                    # Handle lines that are not valid JSON
                    print(
                        f"Error: Could not decode JSON on line {line_count}: {line[:100]}..."
                    )  # Print first 100 chars
                    error_count += 1
                except Exception as e:
                    # Handle other potential errors during processing a line
                    print(f"Error processing line {line_count}: {e}")
                    error_count += 1

        # Calculate the number of unique docids found
        num_unique_docids = len(unique_docids)

        # Print the results
        print("\n--- Analysis Complete ---")
        print(f"Total lines processed: {line_count}")
        print(f"Number of unique docids found: {num_unique_docids}")
        if error_count > 0:
            print(f"Number of lines with errors or missing 'docid': {error_count}")

    except FileNotFoundError:
        # This case is handled by the initial check, but good to have redundancy
        print(f"Error: The file '{jsonl_file_path}' was not found.")
    except Exception as e:
        # Handle potential errors during file opening/reading
        print(f"An unexpected error occurred: {e}")

Found file: 'data/llm4eval_document_2024.jsonl'. Processing...

--- Analysis Complete ---
Total lines processed: 11621
Number of unique docids found: 11621


In [None]:
# calculate number of unique query ids in data/llm4eval_dev_qrel_2024.txt
# Total of 50 query ids split between dev and test txt files with the relevance scores of dev being known to us
input_txt_file_path = "data/llm4eval_dev_qrel_2024.txt"

# Check if the file exists
if not os.path.exists(input_txt_file_path):
    print(f"Error: File not found at '{input_txt_file_path}'")
else:
    print(f"Found file: '{input_txt_file_path}'. Processing...")

    # Use defaultdict(set) to automatically create a new set for unseen queries
    # and store unique docids for each query
    query_to_docids = defaultdict(set)
    all_unique_queries = set()  # To count total unique queries easily

    line_count = 0
    error_count = 0

    try:
        with open(input_txt_file_path, "r", encoding="utf-8") as f:
            for line in f:
                line_count += 1
                line = line.strip()  # Remove leading/trailing whitespace

                if not line:  # Skip empty lines
                    continue

                parts = line.split()  # Split the line by spaces

                # Expecting at least 3 parts: query_id, something, doc_id, ...
                if len(parts) >= 3:
                    query_id = parts[0]
                    doc_id = parts[2]

                    # Add query to the set of all unique queries
                    all_unique_queries.add(query_id)

                    # Add the doc_id to the set associated with this query_id
                    # defaultdict handles creating the set if query_id is new
                    query_to_docids[query_id].add(doc_id)

                else:
                    # Line doesn't have the expected format
                    print(
                        f"Warning: Skipping malformed line {line_count}: {line[:100]}..."
                    )  # Show first 100 chars
                    error_count += 1

        # --- Analysis Results ---
        total_unique_queries = len(all_unique_queries)  # Or len(query_to_docids)

        print("\n--- Analysis Complete ---")
        print(f"Total lines processed: {line_count}")
        print(f"Total unique queries found: {total_unique_queries}")
        if error_count > 0:
            print(f"Number of lines skipped due to formatting errors: {error_count}")

        print("\n--- Unique Document Counts per Query ---")
        # Sort queries for consistent output (optional)
        sorted_queries = sorted(query_to_docids.keys())

        if not sorted_queries:
            print("No valid query-document pairs found.")
        else:
            for query_id in sorted_queries:
                unique_doc_count = len(query_to_docids[query_id])
                print(f"Query '{query_id}': {unique_doc_count} unique documents")

    except FileNotFoundError:
        # Handled by the initial check, but good for robustness
        print(f"Error: The file '{input_txt_file_path}' was not found.")
    except Exception as e:
        # Catch other potential errors during file reading/processing
        print(f"An unexpected error occurred: {e}")

Found file: 'data/llm4eval_dev_qrel_2024.txt'. Processing...

--- Analysis Complete ---
Total lines processed: 7263
Total unique queries found: 25

--- Unique Document Counts per Query ---
Query 'q10': 614 unique documents
Query 'q11': 270 unique documents
Query 'q12': 259 unique documents
Query 'q17': 403 unique documents
Query 'q18': 374 unique documents
Query 'q20': 188 unique documents
Query 'q21': 352 unique documents
Query 'q23': 345 unique documents
Query 'q24': 203 unique documents
Query 'q26': 282 unique documents
Query 'q27': 359 unique documents
Query 'q28': 362 unique documents
Query 'q29': 166 unique documents
Query 'q3': 201 unique documents
Query 'q39': 169 unique documents
Query 'q40': 231 unique documents
Query 'q41': 215 unique documents
Query 'q42': 337 unique documents
Query 'q44': 214 unique documents
Query 'q47': 170 unique documents
Query 'q48': 217 unique documents
Query 'q5': 543 unique documents
Query 'q6': 232 unique documents
Query 'q7': 224 unique documents