In [1]:
from warcio.archiveiterator import ArchiveIterator

from whoosh.index import create_in
from whoosh.fields import Schema, TEXT, ID
from whoosh.qparser import QueryParser
from whoosh.writing import AsyncWriter

import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')

import re, os
from tqdm import tqdm
from bs4 import BeautifulSoup

if not os.path.exists("indexdir"):
    os.mkdir("indexdir")

schema = Schema(title=ID(stored=True), content=TEXT(stored=True))
ix = create_in("indexdir", schema)


[nltk_data] Downloading package punkt to /home/mayank/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:

def clean_html(content):
    
    soup = BeautifulSoup(content, "html.parser")
    for script in soup(["script", "style"]):
        script.extract()  

    text = soup.get_text(separator=" ")  # Keep words spaced properly
    
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text


## Create Index

In [3]:
def index_file(warc_file):
    numDocs = 0
    with open(warc_file, 'rb') as f, ix.writer() as writer:
        for record in tqdm(ArchiveIterator(f), desc="Extracting WARC documents"):
            if record.rec_type == 'response':  # Extract only web page responses
                url = record.rec_headers.get('WARC-Target-URI')
                content = record.content_stream().read().decode('utf-8', errors='ignore')
                text = clean_html(content)
                
                if len(text) > 100:  # Filter out empty or very short pages
                    try:
                        # writer.add_document(title=url, content=word_tokenize(text.lower()))
                        writer.add_document(title=url, content=text)
                        numDocs += 1
                    except UnicodeEncodeError:
                        print("Found a file with non-unicode chacter! Skipping")
        writer.commit()

    print(f"Extracted {len(numDocs)} documents.")

numDocs = 1 #max number of documents to extract/ read. MAX=32
for i in range(1, numDocs+1): 
    warc_file = f"Dataset/{i}.warc"
    index_file(warc_file)

Extracting WARC documents: 0it [00:00, ?it/s]

Extracting WARC documents: 74389it [41:06, 30.16it/s]


IndexingError: This writer is closed

## Search

In [2]:
with ix.searcher() as searcher:
    query = QueryParser("content", ix.schema).parse("Something")
    results = searcher.search(query, limit=5)

    for result in results:
        print(f"Title: {result['title']}, Score: {result.score}")


  0%|          | 0/1000 [00:00<?, ?it/s]

100%|██████████| 1000/1000 [01:24<00:00, 11.83it/s]
100%|██████████| 1000/1000 [01:25<00:00, 11.74it/s]
