# SciFact IR System Pipeline
Run all cells in order: **Setup** → **Preprocessing, Indexing & Ranking** → **Convert Qrels** → **Evaluation**

## Setup
Ensure the working directory is `IR_Files/` so all relative imports and file paths work.

In [1]:
import os
# Make IR_Files the working directory so relative imports and file paths work.
#
# `__file__` is normally undefined in a notebook kernel, and a quoted
# '__file__' merely resolves to a path under the current directory, so a
# chdir derived from it never moves anywhere. The current working directory
# is therefore the only anchor: stay put when already inside IR_Files,
# otherwise descend into the IR_Files folder expected directly below it.
TARGET_DIR = 'IR_Files'
if os.path.basename(os.getcwd()) != TARGET_DIR:
    if not os.path.isdir(TARGET_DIR):
        # Same exception type os.chdir would raise, but with guidance attached.
        raise FileNotFoundError(
            f"Cannot find '{TARGET_DIR}' under {os.getcwd()}; start the notebook "
            f"from the repository root or from inside {TARGET_DIR}."
        )
    os.chdir(TARGET_DIR)
print(f'Working directory: {os.getcwd()}')

Working directory: c:\Users\dongs\Vector_Space_Model_Based_Information_Retrieval_System_for_the_SciFact_Dataset\IR_Files


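## Steps 1–3: Preprocessing, Indexing, Retrieval & Ranking
Performs the `main.py` pipeline steps inline — loads the corpus, preprocesses documents and queries (keeping only odd-numbered query IDs when the query cache is first built), builds or loads the inverted index, ranks documents, and writes the `Results` file. Cached `preprocessed_documents.json`, `preprocessed_queries.json`, and `inverted_index.json` files are reused whenever they exist, so delete them after changing `USE_STEMMING` or the input data.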

In [2]:
import time
from parser import parse_documents_from_file, parse_queries_from_file
from preprocessing import load_stopwords, preprocess_documents, preprocess_queries
from indexing import (
    build_inverted_index,
    calculate_document_frequencies,
    calculate_document_lengths,
    save_inverted_index,
    load_inverted_index,
)
from ranking import VectorSpaceModel
from utils import save_preprocessed_data, load_preprocessed_data

# --- Pipeline configuration -------------------------------------------------
# Every location is derived from the current working directory (BASE_DIR).
# Inputs (dataset, stopword list) live one level above it; generated
# artefacts (index and preprocessing caches) are stored directly inside it.
BASE_DIR = os.getcwd()
parent_dir = os.path.join(BASE_DIR, os.pardir)

# Inputs
dataset_path = os.path.join(parent_dir, 'scifact')
doc_folder_path = os.path.join(dataset_path, 'corpus.jsonl')
query_file_path = os.path.join(dataset_path, 'queries.jsonl')
stopwords_path = os.path.join(parent_dir, 'List of Stopwords.html')

# Generated artefacts
index_file_path = os.path.join(BASE_DIR, 'inverted_index.json')
preprocessed_docs_path = os.path.join(BASE_DIR, 'preprocessed_documents.json')
preprocessed_queries_path = os.path.join(BASE_DIR, 'preprocessed_queries.json')

# Passed as `stem=` to the document and query preprocessing calls.
USE_STEMMING = True

# Timestamp taken before any loading; it is reassigned before the index step.
start_time = time.time()

# Stopword list used by both the document and the query preprocessing.
print('Loading stopwords')
stopwords = load_stopwords(stopwords_path)
print(f'Loaded {len(stopwords)} stopwords')

# Documents: parse + preprocess once, then reuse the JSON cache on later runs.
if not os.path.exists(preprocessed_docs_path):
    print('Preprocessing documents')
    raw_documents = parse_documents_from_file(doc_folder_path)
    documents = preprocess_documents(raw_documents, stopwords, stem=USE_STEMMING)
    save_preprocessed_data(documents, preprocessed_docs_path)
else:
    print('Loading preprocessed documents')
    documents = load_preprocessed_data(preprocessed_docs_path)

# Queries: same caching scheme. The odd-ID filter is applied only when the
# cache is being created; a cached file is loaded as-is.
if not os.path.exists(preprocessed_queries_path):
    print('Preprocessing queries')
    all_queries = parse_queries_from_file(query_file_path)
    odd_id_queries = [entry for entry in all_queries if int(entry['num']) % 2 == 1]
    print(f'Filtered to {len(odd_id_queries)} test queries (odd IDs only)')
    queries = preprocess_queries(odd_id_queries, stopwords, stem=USE_STEMMING)
    save_preprocessed_data(queries, preprocessed_queries_path)
else:
    print('Loading preprocessed queries')
    queries = load_preprocessed_data(preprocessed_queries_path)

# Inverted index: reuse the serialized copy when present, otherwise build it
# from the preprocessed documents and persist it for the next run.
start_time = time.time()
try:
    inverted_index, doc_freqs, doc_lengths = load_inverted_index(index_file_path)
except FileNotFoundError:
    print('Inverted index not found, building a new one.')
    inverted_index = build_inverted_index(documents)
    doc_freqs = calculate_document_frequencies(inverted_index)
    doc_lengths = calculate_document_lengths(documents)
    save_inverted_index(inverted_index, doc_freqs, doc_lengths, index_file_path)
    # Measured from before the failed load attempt through the save above.
    build_seconds = time.time() - start_time
    print(f'Time taken to build inverted index: {build_seconds:.2f} seconds')
else:
    print('Inverted index loaded successfully.')

# Score every query against the index and emit one six-column line per hit:
#   <query id> Q0 <doc id> <rank> <score> <run name>
print('Initializing vector space model')
vsm = VectorSpaceModel(inverted_index, doc_freqs, doc_lengths)

# Process queries in ascending numeric ID order.
queries_sorted = sorted(queries, key=lambda q: int(q['num']))
results_file = 'Results'
run_name = 'vsm_tfidf'

print('Ranking and writing to results file')
with open(results_file, 'w', encoding='utf-8') as output_file:
    for query in queries_sorted:
        query_id = query['num']
        # Queries without a 'tokens' entry are ranked with an empty token list.
        query_tokens = query.get('tokens', [])
        ranked_docs = vsm.rank_documents(query_tokens, top_k=100)
        output_file.writelines(
            f'{query_id} Q0 {doc_id} {rank} {score:.6f} {run_name}\n'
            for rank, (doc_id, score) in enumerate(ranked_docs, start=1)
        )

print(f'Ranking results written to {results_file}')

Loading stopwords
Loaded 779 stopwords
Loading preprocessed documents
Loading preprocessed queries
Inverted index loaded successfully.
Initializing vector space model
Ranking and writing to results file
Ranking results written to Results


## Convert Qrels
Convert `test.tsv` to TREC eval format (`test.qrels`).

In [3]:
from utils import convert_tsv_to_qrels

# Input: the TSV qrels under the dataset's qrels folder, one level above
# BASE_DIR. Output: `test.qrels`, written inside BASE_DIR.
qrels_source_dir = os.path.join(BASE_DIR, os.pardir, 'scifact', 'qrels')
tsv_path = os.path.join(qrels_source_dir, 'test.tsv')
qrels_path = os.path.join(BASE_DIR, 'test.qrels')

convert_tsv_to_qrels(tsv_path, qrels_path)
print(f'Converted {tsv_path} -> {qrels_path}')

Converted c:\Users\dongs\Vector_Space_Model_Based_Information_Retrieval_System_for_the_SciFact_Dataset\IR_Files\..\scifact\qrels\test.tsv -> c:\Users\dongs\Vector_Space_Model_Based_Information_Retrieval_System_for_the_SciFact_Dataset\IR_Files\test.qrels


## Evaluation
Evaluate the `Results` file against `test.qrels` using pytrec_eval. Only queries that appear in both files are scored, and each metric is averaged over those queries.

In [4]:
import pytrec_eval

def _read_trec_rows(path, n_fields, encoding=None):
    """Yield the whitespace-separated fields of each non-blank line in `path`.

    Args:
        path: File to read.
        n_fields: Exact number of fields every non-blank line must contain.
        encoding: Text encoding passed to `open`; None keeps the platform default.

    Yields:
        A list of `n_fields` strings per non-blank line.

    Raises:
        ValueError: If a non-blank line does not have exactly `n_fields`
            fields; the message names the file and the 1-based line number.
    """
    with open(path, 'r', encoding=encoding) as handle:
        for line_no, line in enumerate(handle, start=1):
            fields = line.split()
            if not fields:
                # Blank lines (e.g. a trailing newline) carry no judgment/result.
                continue
            if len(fields) != n_fields:
                raise ValueError(
                    f'{path}:{line_no}: expected {n_fields} fields, got {len(fields)}'
                )
            yield fields


# Load qrels: {query id: {doc id: integer relevance}}
qrels = {}
for qid, _iteration, docid, rel in _read_trec_rows('test.qrels', 4):
    qrels.setdefault(qid, {})[docid] = int(rel)

# Load results: {query id: {doc id: float score}}
# The Results file is written as UTF-8, so it is read back the same way.
results = {}
for qid, _q0, docid, _rank, score, _tag in _read_trec_rows('Results', 6, encoding='utf-8'):
    results.setdefault(qid, {})[docid] = float(score)

# Filter to queries present in both
qrels_filtered = {qid: docs for qid, docs in qrels.items() if qid in results}
results_filtered = {qid: docs for qid, docs in results.items() if qid in qrels_filtered}

print(f'Queries in qrels: {len(qrels)}')
print(f'Queries in results: {len(results)}')
print(f'Queries evaluated: {len(qrels_filtered)}')
print()

if not qrels_filtered:
    # Nothing overlaps, so the table below will be empty; say why.
    print('No query IDs are shared between test.qrels and Results.')

# Evaluate
evaluator = pytrec_eval.RelevanceEvaluator(
    qrels_filtered,
    {'map', 'P_10', 'P_20', 'recip_rank', 'ndcg', 'ndcg_cut_10', 'recall_100'}
)
eval_results = evaluator.evaluate(results_filtered)

# Collect the per-query values of each metric so they can be averaged.
metrics = {}
for qid_metrics in eval_results.values():
    for metric, value in qid_metrics.items():
        metrics.setdefault(metric, []).append(value)

# Print averages (arithmetic mean over the evaluated queries)
print('=== Evaluation Results ===')
for metric in sorted(metrics):
    avg = sum(metrics[metric]) / len(metrics[metric])
    print(f'{metric:20s}: {avg:.4f}')

Queries in qrels: 300
Queries in results: 547
Queries evaluated: 153

=== Evaluation Results ===
P_10                : 0.0948
P_20                : 0.0513
map                 : 0.6021
ndcg                : 0.6832
ndcg_cut_10         : 0.6565
recall_100          : 0.9344
recip_rank          : 0.6196
