In [1]:
from warcio.archiveiterator import ArchiveIterator
import gzip
import xml.etree.ElementTree as ET

from whoosh.index import create_in, open_dir, exists_in
from whoosh.fields import Schema, TEXT, ID
from whoosh.qparser import QueryParser
from whoosh.writing import AsyncWriter
from whoosh import scoring
from whoosh.analysis import StemmingAnalyzer, FancyAnalyzer

import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')

import re, os
import unicodedata
from tqdm import tqdm
from bs4 import BeautifulSoup

# from pygaggle.rerank.base import Query, Text
# from pygaggle.rerank.transformer import MonoT5

if not os.path.exists("indexdir2"):
    os.mkdir("indexdir2")

# schema = Schema(docid=ID(stored=True), title=ID(stored=True), content=TEXT(stored=True, analyzer=FancyAnalyzer()))
schema = Schema(docid=ID(stored=True), title=ID(stored=True), content=TEXT(stored=True))


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mamay\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Create Index

In [2]:
def index_file(warc_file):
    numDocs = 0
    writer = ix.writer()
    with gzip.open(warc_file, 'rb') as stream:
        # for record in ArchiveIterator(stream):
        for record in tqdm(ArchiveIterator(stream), desc="Extracting WARC documents"):
            # if numDocs >= 1000: 
            #     break
            if record.rec_type == 'conversion':  # WET files only contain conversion records
                url = record.rec_headers.get('WARC-Target-URI')
                doc_id = record.rec_headers.get('WARC-Record-ID')
                # text = word_tokenize(record.content_stream().read().decode('utf-8', errors='ignore').strip())
                text = [t.text for t in StemmingAnalyzer()(record.content_stream().read().decode('utf-8', errors='ignore').strip())]

                try:
                    writer.add_document(docid=doc_id, title=url, content=text)
                    numDocs += 1
                except UnicodeEncodeError:
                    print(f"Non-unicode content in doc {doc_id}, skipping...")

    print(f"Extracted {numDocs} documents.")
    writer.commit()

ix = create_in("indexdir2", schema, indexname="documents")
numDocs = 5 #max number of documents to extract/ read. MAX=32
for i in range(1, numDocs+1): 
    # warc_file = f"Dataset/{i}.warc"
    warc_file = f"Dataset/{i}.warc.wet.gz"
    index_file(warc_file)

Extracting WARC documents: 37341it [06:32, 95.19it/s] 


Extracted 37340 documents.


Extracting WARC documents: 37167it [06:56, 89.26it/s] 


Extracted 37166 documents.


Extracting WARC documents: 37532it [07:09, 87.31it/s] 


Extracted 37531 documents.


Extracting WARC documents: 37352it [06:52, 90.62it/s] 


Extracted 37351 documents.


Extracting WARC documents: 37120it [06:41, 92.39it/s] 


Extracted 37119 documents.


## Search

In [3]:
def eval_all_queries(ix, query_file="Dataset/queries/topics/misinfo-2020-topics.xml", top_k=10):
    # Build paths safely
    query_path = os.path.join(query_file)

    # Parse XML topic file
    tree = ET.parse(query_path)
    root = tree.getroot()

    # Initialize query parser
    qp = QueryParser("content", schema=schema)

    results_all = []
    results_reranker = []

    with ix.searcher(weighting=scoring.BM25F()) as searcher:
        for topic in root.findall("topic"):
            topic_results = []
            topic_reranker = []

            topic_id = topic.find("number").text.strip()
            description = topic.find("description").text.strip()

            # parsed_query = qp.parse(description)
            # parsed_query = qp.parse(word_tokenize(description))
            print([t.text for t in StemmingAnalyzer()(description)])
            print(description)
            parsed_query = qp.parse(str([t.text for t in StemmingAnalyzer()(description)]))
            print(parsed_query)
            results = searcher.search(parsed_query, limit=top_k)

            for rank, result in enumerate(results):
                trec_entry = {
                    "topic_id": topic_id,
                    "doc_id": result["docid"],
                    "rank": rank + 1,
                    "score": result.score
                }
                topic_results.append(trec_entry)
                results_reranker.append((str(rank), result['content']))
            results_all.append(topic_results)
            print(topic_results)
            results_reranker.append(topic_reranker)

    return results_all, results_reranker

ix = open_dir("indexdir2", indexname="documents")
rank_eval, rank_content = eval_all_queries(ix)
print(rank_eval)

['vitamin', 'cure', 'covid', '19']
Can vitamin D cure COVID-19?
(content:vitamin AND content:cure AND content:covid AND content:19)
[]
['vitamin', 'cure', 'covid', '19']
Can vitamin C cure COVID-19?
(content:vitamin AND content:cure AND content:covid AND content:19)
[]
['bcg', 'vaccin', 'prevent', 'covid', '19']
Can BCG vaccine prevent COVID-19?
(content:bcg AND content:vaccin AND content:prevent AND content:covid AND content:19)
[]
['ibuprofen', 'worsen', 'covid', '19']
Can ibuprofen worsen COVID-19?
(content:ibuprofen AND content:worsen AND content:covid AND content:19)
[]
['gargl', 'salt', 'water', 'prevent', 'covid', '19']
Can gargling salt water prevent COVID-19?
(content:gargl AND content:salt AND content:water AND content:prevent AND content:covid AND content:19)
[]
['ginger', 'cure', 'covid', '19']
Can Ginger cure COVID-19?
(content:ginger AND content:cure AND content:covid AND content:19)
[]
['5g', 'antenna', 'caus', 'covid', '19']
Can 5G antennas cause COVID-19?
(content:5g A

In [4]:
ix = open_dir("indexdir", indexname="documents")
with ix.searcher() as searcher:
    docnum = 5  # change this to the document number you want
    stored_fields = searcher.stored_fields(docnum)
    print("Stored fields in document:", stored_fields)

Stored fields in document: {'content': ['20', 'minute', 'full', 'body', 'hiit', 'cardio', 'workout', 'self', 'skip', 'main', 'content', 'open', 'navigation', 'menu', 'menu', 'full', 'body', 'hiit', 'cardio', 'workout', 'no', 'equipment', 'fitness', 'food', 'health', 'love', 'beauty', 'culture', 'more', 'chevron', 'search', 'search', 'fitness', 'workouts', 'shape', 'up', 'running', 'yoga', 'food', 'healthy', 'eating', 'nutrition', 'weight', 'loss', 'recipes', 'cooking', 'health', 'mental', 'health', 'sexual', 'reproductive', 'health', 'pregnancy', 'motherhood', 'sleep', 'love', 'sex', 'relationships', 'weddings', 'single', 'life', 'breakups', 'beauty', 'makeup', 'hair', 'fashion', 'nails', 'skin', 'culture', 'career', 'money', 'travel', 'entertainment', 'technology', 'family', 'sports', 'politics', 'fitness', 'december', '31', '2019', 'full', 'body', 'hiit', 'cardio', 'workout', 'no', 'equipment', 'goal', 'today’s', 'workout', 'simple', 'get', 'breathless', 'amy', 'eisinger', 'facebook'

## Reranker

In [5]:
def neural_rerank_all_queries(rank_eval, rank_content):
    reranker = MonoT5()
    reranked_all = []

    for query_results, query_passages in zip(rank_eval, rank_content):
        if not query_results or not query_passages:
            continue

        topic_id = query_results[0]["topic_id"]
        query_text = query_results[0].get("description", topic_id)  # fallback in case description isn't saved

        query = Query(query_text)

        texts = [Text(text=content, metadata={"docid": result["doc_id"]}) 
                 for content, result in zip([p[1] for p in query_passages], query_results)]

        reranked = reranker.rerank(query, texts)

        reranked_topic = []
        for rank, text in enumerate(reranked):
            reranked_topic.append({
                "topic_id": topic_id,
                "doc_id": text.metadata["docid"],
                "rank": rank + 1,
                "score": text.score
            })

        reranked_all.append(reranked_topic)

    return reranked_all
