In [9]:
from warcio.archiveiterator import ArchiveIterator

from whoosh.index import create_in, open_dir
from whoosh.fields import Schema, TEXT, ID
from whoosh.qparser import QueryParser
from whoosh.writing import AsyncWriter

import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')

import re, os
import unicodedata
from tqdm import tqdm
from bs4 import BeautifulSoup

if not os.path.exists("indexdir"):
    os.mkdir("indexdir")

schema = Schema(title=ID(), content=TEXT())
ix = create_in("indexdir", schema)


[nltk_data] Downloading package punkt to /home/mayank/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:

def clean_html(content):
    
    soup = BeautifulSoup(content, "html.parser")
    for script in soup(["script", "style"]):
        script.extract()  

    text = soup.get_text(separator=" ")  # Keep words spaced properly
    
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text

def normalize_unicode(text):
    return unicodedata.normalize('NFKC', text)

def safe_tokenize(text):
    text = normalize_unicode(text)
    tokens = word_tokenize(text)
    clean_tokens = [
        tok for tok in tokens 
        if all(unicodedata.category(c)[0] != "C" for c in tok)  # Remove control characters
    ]
    return clean_tokens


## Create Index

In [8]:
def index_file(warc_file):
    numDocs = 0
    writer = ix.writer()
    with open(warc_file, 'rb') as f:
        for record in tqdm(ArchiveIterator(f), desc="Extracting WARC documents"):
            if record.rec_type == 'response':  # Extract only web page responses
                url = record.rec_headers.get('WARC-Target-URI')
                content = record.content_stream().read().decode('utf-8', errors='ignore')
                text = clean_html(content)
                
                if len(text) > 100:  # Filter out empty or very short pages
                    try:
                        # writer.add_document(title=url, content=word_tokenize(text.lower()))
                        writer.add_document(title=url, content=safe_tokenize(text))
                        numDocs += 1
                        # writer.commit()
                    except UnicodeEncodeError:
                        print("Found a file with non-unicode chacter! Skipping")

    print(f"Extracted {numDocs} documents.")

numDocs = 1 #max number of documents to extract/ read. MAX=32
for i in range(1, numDocs+1): 
    warc_file = f"Dataset/{i}.warc"
    index_file(warc_file)

Extracting WARC documents: 74389it [49:05, 25.25it/s]


TypeError: object of type 'int' has no len()

## Search

In [10]:
import whoosh.index as index

ix = index.open_dir("indexdir")
exists = index.exists_in("indexdir")
usages_exists = index.exists_in("indexdir", indexname="usages")
print(exists, usages_exists)

# ix = open_dir("indexdir")
with ix.searcher() as searcher:
    query = QueryParser("content", ix.schema).parse("some")
    results = searcher.search(query, limit=5)

    for result in results:
        print(f"Title: {result['title']}, Score: {result.score}")


True False
