In [1]:
import os
import xml.etree.ElementTree as ET
import whoosh.index as index
from whoosh.fields import Schema, TEXT, DATETIME
from whoosh.qparser import QueryParser, MultifieldParser, AndGroup

# Field layout of the bibliographic index. Every field is stored so that a
# search hit can be printed directly, without going back to the XML source.
# NOTE(review): `year` is declared as DATETIME but is populated with bare
# year strings (e.g. "2000") by the indexing loop — confirm a date field,
# rather than a numeric one, is really intended.
schema = Schema(
    title=TEXT(stored=True),
    author=TEXT(stored=True, vector=True),
    journal=TEXT(stored=True),
    year=DATETIME(stored=True),
    url=TEXT(stored=True),
)

# (Re)create the index. `create_in` always starts a brand-new, empty index in
# the directory — it does NOT open an existing one — so re-running this cell
# rebuilds the index from scratch instead of appending duplicate documents.
# `makedirs(exist_ok=True)` makes the directory step idempotent.
INDEX_DIR = "index"
os.makedirs(INDEX_DIR, exist_ok=True)
ix = index.create_in(INDEX_DIR, schema)

# Parse the XML file.
# NOTE(review): ET.parse builds the whole element tree in memory; if the input
# is a full-size dump, consider streaming with ET.iterparse instead.
tree = ET.parse(r"D:\dblp.xml")
root = tree.getroot()


def _element_text(element):
    """Return all text inside ``element``, including text nested in child tags.

    ``element.text`` only yields the text *before* the first child element, so
    a value containing inline markup (``<i>``, ``<sub>``, ...) would be cut
    short or come back as ``None``. ``itertext()`` walks the whole subtree.
    """
    return "".join(element.itertext())


def _child_text(record, tag, default):
    """Return the text of the first ``tag`` child of ``record``.

    Falls back to ``default`` when the child is missing or has no text.
    """
    node = record.find(tag)
    if node is None:
        return default
    return _element_text(node) or default


# Index the data from the XML.
# Using the writer as a context manager commits on success and cancels on
# error, so a failure mid-loop does not leave the index write lock held
# (which would make every later attempt to open a writer fail).
with ix.writer() as writer:
    for article in root.findall('article'):
        title = _child_text(article, "title", 'Not Specified')

        # All authors of a record are flattened into one comma-separated
        # string; authors without any text are skipped so join() cannot fail.
        author_names = [_element_text(a) for a in article.findall("author")]
        authors = ', '.join(name for name in author_names if name) or 'Not Specified'

        journal = _child_text(article, "journal", 'Not Specified')
        # Records without a <year> fall back to the placeholder year '2000'.
        year = _child_text(article, "year", '2000')
        # <ee> holds the electronic-edition link; empty string when absent.
        url = _child_text(article, "ee", "")

        writer.add_document(title=title, author=authors, journal=journal, year=year, url=url)

'# Perform a search\nquery_str = "author:Paul Kocher"\nwith ix.searcher() as searcher:\n    query = QueryParser("author", ix.schema).parse(query_str)\n    results = searcher.search(query)\n    for result in results:\n        print(f"Title: {result[\'title\']}, Author(s): {\', \'.join(result[\'author\'])}, Journal: {result[\'journal\']}, Year: {result[\'year\']}, URL(s): {result[\'url\']}")'

In [3]:
# Open the existing index and run a free-text query across every field.
ix = index.open_dir("index")
query_str = "Spectre Attacks"

# Search all fields, including the author field. `Schema.names()` is the
# public way to list the field names (instead of the private `_fields` dict).
fields = ix.schema.names()

# AndGroup requires every word of the query to match.
# Note: despite its name, `query_parser` holds the *parsed query object*
# returned by .parse(), not the parser itself; the name is kept unchanged in
# case later cells refer to it.
query_parser = MultifieldParser(fields, ix.schema, group=AndGroup).parse(query_str)

# The searcher holds open index files, so it is scoped with `with`; hits are
# only read while the searcher is still open.
with ix.searcher() as searcher:
    results = searcher.search(query_parser)
    for result in results:
        print(f"Title: {result['title']}, Author(s): {result['author']}, Journal: {result['journal']}, Year: {result['year']}, URL(s): {result['url']}")

Title: Spectre Attacks: Exploiting Speculative Execution., Author(s): Paul Kocher, Daniel Genkin, Daniel Gruss, Werner Haas 0004, Mike Hamburg, Moritz Lipp, Stefan Mangard, Stefan Mangard, Thomas Prescher 0002, Michael Schwarz 0001, Yuval Yarom, Journal: meltdownattack.com, Year: 2018, URL(s): https://spectreattack.com/spectre.pdf
Title: Spectre Attacks: Exploiting Speculative Execution., Author(s): Paul Kocher, Daniel Genkin, Daniel Gruss, Werner Haas 0004, Mike Hamburg, Moritz Lipp, Stefan Mangard, Thomas Prescher 0002, Michael Schwarz 0001, Yuval Yarom, Journal: CoRR, Year: 2018, URL(s): http://arxiv.org/abs/1801.01203
Title: Software Mitigation of RISC-V Spectre Attacks., Author(s): Ruxandra Balucea, Paul Irofti, Journal: CoRR, Year: 2022, URL(s): https://doi.org/10.48550/arXiv.2206.04507
Title: Spectre attacks: exploiting speculative execution., Author(s): Paul Kocher, Jann Horn, Anders Fogh, Daniel Genkin, Daniel Gruss, Werner Haas 0004, Mike Hamburg, Moritz Lipp, Stefan Mangard,