In [1]:
from warcio.archiveiterator import ArchiveIterator
import gzip
import xml.etree.ElementTree as ET

from whoosh.index import create_in, open_dir, exists_in
from whoosh.fields import Schema, TEXT, ID
from whoosh.qparser import QueryParser
from whoosh.writing import AsyncWriter
from whoosh import scoring

import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')

import re, os
import unicodedata
from tqdm import tqdm
from bs4 import BeautifulSoup

# from pygaggle.rerank.base import Query, Text
# from pygaggle.rerank.transformer import MonoT5

if not os.path.exists("indexdir"):
    os.mkdir("indexdir")

schema = Schema(docid=TEXT(stored=True), title=ID(stored=True), content=TEXT(stored=True))


[nltk_data] Downloading package punkt to /home/mayank/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Create Index

In [2]:
def index_file(warc_file):
    numDocs = 0
    writer = ix.writer()
    with gzip.open(warc_file, 'rb') as stream:
        # for record in ArchiveIterator(stream):
        for record in tqdm(ArchiveIterator(stream), desc="Extracting WARC documents"):
            if record.rec_type == 'conversion':  # WET files only contain conversion records
                url = record.rec_headers.get('WARC-Target-URI')
                doc_id = record.rec_headers.get('WARC-Record-ID')
                text = record.content_stream().read().decode('utf-8', errors='ignore').strip()

                try:
                    writer.add_document(docid=doc_id, title=url, content=text)
                    numDocs += 1
                except UnicodeEncodeError:
                    print(f"Non-unicode content in doc {doc_id}, skipping...")

    print(f"Extracted {numDocs} documents.")
    writer.commit()

ix = create_in("indexdir", schema, indexname="documents")
numDocs = 5 #max number of documents to extract/ read. MAX=32
for i in range(1, numDocs+1): 
    # warc_file = f"Dataset/{i}.warc"
    warc_file = f"Dataset/{i}.warc.wet.gz"
    index_file(warc_file)

Extracting WARC documents: 37341it [04:05, 152.41it/s]


Extracted 37340 documents.


Extracting WARC documents: 37167it [04:07, 149.90it/s]


Extracted 37166 documents.


Extracting WARC documents: 37532it [04:23, 142.58it/s]


Extracted 37531 documents.


Extracting WARC documents: 37352it [03:40, 169.25it/s]


Extracted 37351 documents.


Extracting WARC documents: 37120it [03:28, 177.63it/s]


Extracted 37119 documents.


## Search

In [5]:
def eval_all_queries(ix, query_file="Dataset/queries/topics/misinfo-2020-topics.xml", top_k=1000):
    # Build paths safely
    query_path = os.path.join(query_file)

    # Parse XML topic file
    tree = ET.parse(query_path)
    root = tree.getroot()

    # Initialize query parser
    qp = QueryParser("content", schema=schema)

    results_all = []
    results_reranker = []

    with ix.searcher(weighting=scoring.BM25F()) as searcher:
        for topic in root.findall("topic"):
            topic_results = []
            topic_reranker = []

            topic_id = topic.find("number").text.strip()
            description = topic.find("description").text.strip()

            parsed_query = qp.parse(description)
            results = searcher.search(parsed_query, limit=top_k)

            for rank, result in enumerate(results):
                trec_entry = {
                    "topic_id": topic_id,
                    "doc_id": result["docid"],
                    "rank": rank + 1,
                    "score": result.score
                }
                topic_results.append(trec_entry)
                results_reranker.append((str(rank), result['content']))
            results_all.append(topic_results)
            results_reranker.append(topic_reranker)

    return results_all, results_reranker

ix = open_dir("indexdir", indexname="documents")
rank_eval, rank_content = eval_all_queries(ix)
print(rank_eval)

[[{'topic_id': '1', 'doc_id': '<urn:uuid:da517e77-1418-4377-8dce-7ea1c28e1472>', 'rank': 1, 'score': 8.444916546473394}, {'topic_id': '1', 'doc_id': '<urn:uuid:b0d88782-d293-4c7f-90b3-a396a288389b>', 'rank': 2, 'score': 7.753888182837189}, {'topic_id': '1', 'doc_id': '<urn:uuid:fac994df-fbcd-4b41-ae77-434bdb590225>', 'rank': 3, 'score': 7.670401978462944}, {'topic_id': '1', 'doc_id': '<urn:uuid:7a4466ea-7dc6-405d-94c5-2cce50d8dc3b>', 'rank': 4, 'score': 7.616834122478771}, {'topic_id': '1', 'doc_id': '<urn:uuid:effa65fe-868c-43c4-bfe4-9862cad09138>', 'rank': 5, 'score': 7.494524469758668}, {'topic_id': '1', 'doc_id': '<urn:uuid:5911d442-daee-4147-8fa9-82b672a204af>', 'rank': 6, 'score': 7.462495917531636}, {'topic_id': '1', 'doc_id': '<urn:uuid:b895d60e-f7f3-4ca8-be26-bff1cec2b8d0>', 'rank': 7, 'score': 7.457166610578453}, {'topic_id': '1', 'doc_id': '<urn:uuid:d2bc9f55-600b-40a1-af89-93de50578bd2>', 'rank': 8, 'score': 7.406224346349593}, {'topic_id': '1', 'doc_id': '<urn:uuid:2024e63

## Reranker

In [4]:
def neural_rerank_all_queries(rank_eval, rank_content):
    reranker = MonoT5()
    reranked_all = []

    for query_results, query_passages in zip(rank_eval, rank_content):
        if not query_results or not query_passages:
            continue

        topic_id = query_results[0]["topic_id"]
        query_text = query_results[0].get("description", topic_id)  # fallback in case description isn't saved

        query = Query(query_text)

        texts = [Text(text=content, metadata={"docid": result["doc_id"]}) 
                 for content, result in zip([p[1] for p in query_passages], query_results)]

        reranked = reranker.rerank(query, texts)

        reranked_topic = []
        for rank, text in enumerate(reranked):
            reranked_topic.append({
                "topic_id": topic_id,
                "doc_id": text.metadata["docid"],
                "rank": rank + 1,
                "score": text.score
            })

        reranked_all.append(reranked_topic)

    return reranked_all
