In [1]:
import os, re, math, shlex
import xml.etree.ElementTree as ET
from collections import defaultdict, Counter
from nltk.stem import PorterStemmer

In [2]:
# Input files (resolved relative to the working directory).
TREC_FILE = "trec.5000.xml"                   # XML collection parsed by load_trec_docs()
STOPWORDS_FILE = "stopwords.txt"              # one stopword per line, read by load_stopwords()
BOOLEAN_QUERIES_FILE = "queries.boolean.txt"  # read by main(): "<query number> <query text>" per line
RANKED_QUERIES_FILE = "queries.ranked.txt"    # read by main(): "<query number> <query text>" per line
# Output locations; everything is written below RESULTS_FOLDER.
RESULTS_FOLDER = "results"
BOOLEAN_RESULTS_FILE = os.path.join(RESULTS_FOLDER, "results.boolean.txt")
RANKED_RESULTS_FILE = os.path.join(RESULTS_FOLDER, "results.ranked.txt")
INDEX_FILE = os.path.join(RESULTS_FOLDER, "index.txt")

In [3]:
USE_STOPWORDS = True   # preprocess() drops tokens found in STOPWORDS when True
USE_STEMMING = True    # preprocess() applies the Porter stemmer when True
# NOTE(review): PROXIMITY_DEFAULT_K is not referenced by any cell visible here.
PROXIMITY_DEFAULT_K = 30
TOP_K_RANKED = 150     # maximum number of results ranked_search() returns per query

In [4]:
# Create the output directory up front so later cells can open files inside it.
os.makedirs(RESULTS_FOLDER, exist_ok=True)
# Single stemmer instance shared by preprocess().
ps = PorterStemmer()
print("Configuration loaded. Stopwords enabled:", USE_STOPWORDS)

Configuration loaded. Stopwords enabled: True


In [5]:
def load_stopwords(path):
    """Read a stopword list from ``path`` (one word per line).

    Every non-blank line is stripped of surrounding whitespace and
    lower-cased.

    Returns:
        set[str]: the stopwords, or an empty set if ``path`` does not exist.
    """
    stopwords = set()
    if os.path.exists(path):
        with open(path, 'r', encoding='utf-8') as handle:
            for line in handle:
                word = line.strip()
                if word:
                    stopwords.add(word.lower())
    return stopwords

# Stopword set consulted by preprocess(); empty when STOPWORDS_FILE does not exist.
STOPWORDS = load_stopwords(STOPWORDS_FILE)

def tokenize(text):
    """Lower-case ``text`` and return its word tokens in order of appearance.

    A token is a maximal run of ``\\w`` characters (letters, digits,
    underscore); everything else acts as a separator.
    """
    lowered = text.lower()
    word_pattern = re.compile(r'\b\w+\b')
    return [match.group(0) for match in word_pattern.finditer(lowered)]

def preprocess(tokens):
    """Apply the configured normalisation steps to a list of tokens.

    Stopword removal runs first (when ``USE_STOPWORDS`` is set), then
    stemming with the shared stemmer ``ps`` (when ``USE_STEMMING`` is set).
    With both switches off the input list is returned unchanged.
    """
    processed = tokens
    if USE_STOPWORDS:
        processed = [tok for tok in processed if tok not in STOPWORDS]
    if USE_STEMMING:
        processed = list(map(ps.stem, processed))
    return processed


In [6]:
#LOADING THE XML FILE
def load_trec_docs(path):
    """Parse an XML document collection into a ``{docno: content}`` dict.

    Every ``<DOC>`` element below the root contributes one entry keyed by
    its stripped ``<DOCNO>`` text. The content is the stripped ``<HEADLINE>``
    and ``<TEXT>`` joined by a single space; a missing or empty child
    contributes an empty string.

    ``<DOC>`` elements without a usable ``<DOCNO>`` are skipped instead of
    aborting the whole load with an ``AttributeError``.

    Args:
        path: path of the XML file to parse.

    Returns:
        dict[str, str]: document number -> "headline text" content.

    Raises:
        xml.etree.ElementTree.ParseError: if the file is not well-formed XML.
        OSError: if the file cannot be read.
    """
    root = ET.parse(path).getroot()
    docs = {}
    for doc in root.findall('.//DOC'):
        # findtext() yields None for a missing child and '' for an empty one,
        # so "or ''" normalises both cases with a single lookup per field.
        docno = (doc.findtext('DOCNO') or '').strip()
        if not docno:
            # Nothing to key the document on; it could never be reported.
            continue
        headline = (doc.findtext('HEADLINE') or '').strip()
        text = (doc.findtext('TEXT') or '').strip()
        docs[docno] = f"{headline} {text}".strip()
    print(f"Loaded {len(docs)} documents from {path}")
    return docs

# Load the collection once; the index-building cell and main() read this global.
docs = load_trec_docs(TREC_FILE)


Loaded 5000 documents from trec.5000.xml


In [7]:
#POSITIONAL INVERTED INDEX
def build_index(docs):
    """Build a positional inverted index from ``{docno: content}``.

    Each document is tokenised and preprocessed; positions are 1-based
    offsets into the preprocessed token stream.

    Returns:
        A nested ``defaultdict``: ``term -> docno -> [positions]``.
    """
    postings = defaultdict(lambda: defaultdict(list))
    for docno in docs:
        terms = preprocess(tokenize(docs[docno]))
        position = 0
        for term in terms:
            position += 1
            postings[term][docno].append(position)
    return postings

def save_index(index):
    """Write ``index`` to ``INDEX_FILE`` as text.

    Terms are written in sorted order. Each entry is a ``term:df`` line,
    then one tab-indented ``docid: p1, p2, ...`` line per document, then a
    blank line.
    """
    with open(INDEX_FILE, "w", encoding="utf-8") as out:
        for term in sorted(index):
            postings = index[term]
            entry = [f"{term}:{len(postings)}\n"]
            for docid in postings:
                joined = ', '.join(str(p) for p in postings[docid])
                entry.append(f"\t{docid}: {joined}\n")
            entry.append("\n")
            out.writelines(entry)
    print("Index saved to", INDEX_FILE)

# Build the positional index over the loaded collection and persist it to INDEX_FILE.
index = build_index(docs)
save_index(index)
print("Index built with", len(index), "unique terms.")


Index saved to results\index.txt
Index built with 39308 unique terms.


In [8]:
#PHRASE SEARCH
def phrase_search(phrase, index):
    """Return the sorted doc ids in which ``phrase`` occurs as consecutive terms.

    The phrase goes through the same tokenise/preprocess pipeline as the
    documents, so positions are compared in the preprocessed token stream.

    Args:
        phrase: raw phrase text (without surrounding quotes).
        index: positional index, ``term -> docno -> [positions]``.

    Returns:
        list: sorted document ids; empty when the phrase has no searchable
        terms or any of its terms is absent from the index.
    """
    tokens = preprocess(tokenize(phrase))
    if not tokens or any(tok not in index for tok in tokens):
        return []

    first_posting = index[tokens[0]]
    following = [index[tok] for tok in tokens[1:]]

    results = []
    for doc, start_positions in first_posting.items():
        # Read the other postings with .get(): the index is a nested
        # defaultdict, and subscripting it with a doc the term does not occur
        # in would silently insert an empty posting and corrupt the index.
        follow_sets = []
        for posting in following:
            positions_here = posting.get(doc)
            if not positions_here:
                follow_sets = None
                break
            # Set membership keeps the per-position check O(1).
            follow_sets.append(set(positions_here))
        if follow_sets is None:
            continue
        if any(
            all((start + offset) in positions
                for offset, positions in enumerate(follow_sets, start=1))
            for start in start_positions
        ):
            results.append(doc)
    return sorted(results)



In [9]:
#PROXIMITY SEARCH
def proximity_search(t1, t2, k, index):
    """Return the sorted doc ids where ``t1`` and ``t2`` occur within ``k`` positions.

    Distance is measured in the preprocessed token stream and is symmetric
    (either term may come first).

    Args:
        t1, t2: raw query terms.
        k: maximum allowed absolute position difference.
        index: positional index, ``term -> docno -> [positions]``.

    Returns:
        list: sorted document ids; empty when either term is removed by
        preprocessing (e.g. it is a stopword) or is absent from the index.
    """
    # Lower-case first so stopword filtering and index lookup see the same
    # form the documents were indexed with.
    terms = preprocess([t1.strip().lower(), t2.strip().lower()])
    if len(terms) != 2:
        # A term was dropped as a stopword; unpacking would raise ValueError.
        return []
    t1, t2 = terms
    if t1 not in index or t2 not in index:
        return []

    first_posting, second_posting = index[t1], index[t2]
    results = []
    # Only docs present in both postings are subscripted, so the nested
    # defaultdict never creates new entries here.
    for doc in first_posting.keys() & second_posting.keys():
        second_positions = second_posting[doc]
        if any(abs(p1 - p2) <= k
               for p1 in first_posting[doc]
               for p2 in second_positions):
            results.append(doc)
    return sorted(results)

In [10]:
#BOOLEAN SEARCH
def _boolean_operand_docs(operand, index):
    """Return the doc set matching one query operand.

    An operand is a quoted phrase, a ``#k(t1, t2)`` proximity expression or
    a plain term. Returns ``None`` when a plain term has nothing left after
    preprocessing (e.g. a stopword), meaning it imposes no constraint.
    """
    if len(operand) >= 2 and operand[0] == '"' and operand[-1] == '"':
        return set(phrase_search(operand[1:-1], index))

    prox = re.fullmatch(r"#(\d+)\(([^,]+),([^\)]+)\)", operand)
    if prox:
        k = int(prox.group(1))
        return set(proximity_search(prox.group(2).strip(), prox.group(3).strip(), k, index))

    terms = preprocess(tokenize(operand))
    if not terms:
        return None
    matches = None
    for term in terms:
        # .get() avoids creating an entry for an unknown term in a
        # defaultdict index; postings without positions are ignored.
        posting = index.get(term, {})
        docs_with_term = {doc for doc, positions in posting.items() if positions}
        matches = docs_with_term if matches is None else matches & docs_with_term
    return matches


def boolean_search(query, index):
    """Evaluate a Boolean query and return the set of matching doc ids.

    Supported syntax, evaluated left to right:
      * operands: plain terms, ``"quoted phrases"``, ``#k(t1, t2)`` proximity;
      * upper-case operators ``AND`` / ``OR`` between operands
        (adjacent operands without an operator are ANDed);
      * upper-case ``NOT`` directly before an operand, complemented against
        all documents present in ``index``.

    A term missing from the index matches nothing, so ANDing with it gives
    an empty result. Operands removed entirely by preprocessing are skipped.

    Args:
        query: query text without the leading query number.
        index: positional index, ``term -> docno -> [positions]``.

    Returns:
        set: matching document ids (empty for an empty query).
    """
    # Split into phrases, proximity expressions and whitespace-separated
    # words, so operator names inside phrases or words are never rewritten.
    pieces = re.findall(r'"[^"]*"|#\d+\([^)]*\)|\S+', query)

    result = None
    operator, negate = "AND", False
    universe = None  # built lazily; only NOT needs it
    for piece in pieces:
        if piece in ("AND", "OR"):
            operator = piece
            continue
        if piece == "NOT":
            negate = not negate
            continue

        matched = _boolean_operand_docs(piece, index)
        if matched is None:
            # Operand vanished in preprocessing: drop it and its operator.
            operator, negate = "AND", False
            continue
        if negate:
            if universe is None:
                universe = {doc
                            for posting in index.values()
                            for doc, positions in posting.items() if positions}
            matched = universe - matched

        if result is None:
            result = matched
        elif operator == "OR":
            result = result | matched
        else:
            result = result & matched
        operator, negate = "AND", False

    return result if result is not None else set()


In [11]:
#RANKED RETRIEVAL (TF-IDF)
def compute_tfidf(index, N):
    """Compute idf values, per-document TF-IDF vectors and their norms.

    Weights are ``(1 + log10(tf)) * log10(N / df)`` where ``tf`` is the
    number of positions of the term in the document and ``df`` the number
    of documents containing the term.

    Postings with an empty position list are ignored: they would otherwise
    raise a math domain error on ``log10(0)`` and inflate ``df``. Terms
    left without any document get no idf entry.

    Args:
        index: positional index, ``term -> docno -> [positions]``.
        N: total number of documents in the collection.

    Returns:
        tuple: ``(idf, doc_vecs, norms)`` where ``idf`` maps term -> idf,
        ``doc_vecs`` maps docno -> {term: weight} and ``norms`` maps
        docno -> Euclidean length of its vector.
    """
    idf = {}
    doc_vecs = defaultdict(dict)
    for term, posting in index.items():
        term_freqs = {doc: len(positions)
                      for doc, positions in posting.items() if positions}
        if not term_freqs:
            continue
        term_idf = math.log10(N / len(term_freqs))
        idf[term] = term_idf
        for doc, tf in term_freqs.items():
            doc_vecs[doc][term] = (1 + math.log10(tf)) * term_idf
    norms = {doc: math.sqrt(sum(w * w for w in vec.values()))
             for doc, vec in doc_vecs.items()}
    return idf, doc_vecs, norms

def ranked_search(query, index, idf, doc_vecs, norms):
    """Rank documents against ``query`` by cosine similarity of TF-IDF vectors.

    The query is tokenised and preprocessed like the documents and weighted
    as ``(1 + log10(tf)) * idf``; terms without an idf entry get weight 0.
    Only documents with a positive score are returned, so documents that
    share no weighted term with the query never pad the result list.

    Args:
        query: raw query text.
        index: positional index (kept for interface compatibility; unused).
        idf: term -> idf, as returned by ``compute_tfidf``.
        doc_vecs: docno -> {term: weight}.
        norms: docno -> vector length.

    Returns:
        list[tuple]: up to ``TOP_K_RANKED`` ``(docno, score)`` pairs, best first.
    """
    tokens = preprocess(tokenize(query))
    q_vec = {term: (1 + math.log10(freq)) * idf.get(term, 0)
             for term, freq in Counter(tokens).items()}
    q_norm = math.sqrt(sum(w * w for w in q_vec.values()))
    if q_norm == 0:
        # Empty query or no query term known to the collection.
        return []

    scores = {}
    for doc, vec in doc_vecs.items():
        dot = sum(weight * vec.get(term, 0) for term, weight in q_vec.items())
        if dot <= 0:
            # No overlap with the query: not a match, do not report it.
            continue
        denom = norms[doc] * q_norm
        if denom > 0:
            scores[doc] = dot / denom
    return sorted(scores.items(), key=lambda item: item[1], reverse=True)[:TOP_K_RANKED]



In [12]:
def _read_query_lines(path):
    """Return the stripped, non-blank lines of the query file at ``path``."""
    with open(path, 'r', encoding='utf-8') as handle:
        return [raw.strip() for raw in handle if raw.strip()]


def _split_query_line(line):
    """Split ``"<number> <text...>"`` into ``(number, text)``, collapsing whitespace in the text."""
    number, *words = line.split()
    return number, " ".join(words)


def main():
    """Run every Boolean and ranked query and write both result files.

    Boolean results are written as ``qnum,doc`` lines (docs sorted), ranked
    results as ``qnum,doc,score`` lines with the score to four decimals.
    Reads the module-level ``index`` and ``docs``.
    """
    print("\nRunning IR system...")
    idf, doc_vecs, norms = compute_tfidf(index, len(docs))

    # Boolean, phrase and proximity queries
    boolean_queries = _read_query_lines(BOOLEAN_QUERIES_FILE)
    with open(BOOLEAN_RESULTS_FILE, 'w', encoding='utf-8') as out:
        for line in boolean_queries:
            qnum, qtext = _split_query_line(line)
            matches = boolean_search(qtext, index)
            out.writelines(f"{qnum},{doc}\n" for doc in sorted(matches))
    print(f"Results saved to {BOOLEAN_RESULTS_FILE}")

    # Ranked retrieval (TF-IDF)
    ranked_queries = _read_query_lines(RANKED_QUERIES_FILE)
    with open(RANKED_RESULTS_FILE, 'w', encoding='utf-8') as out:
        for line in ranked_queries:
            qnum, qtext = _split_query_line(line)
            for doc, score in ranked_search(qtext, index, idf, doc_vecs, norms):
                out.write(f"{qnum},{doc},{score:.4f}\n")
    print(f"Results saved to {RANKED_RESULTS_FILE}")


if __name__ == "__main__":
    main()



Running IR system...
Results saved to results\results.boolean.txt
Results saved to results\results.ranked.txt
