In [None]:
# Imports
import os
import shutil
import pandas as pd
from datetime import datetime
from lupyne import engine
import lucene

from org.apache.lucene.search.similarities import ClassicSimilarity, BM25Similarity
from org.apache.lucene.analysis.core import WhitespaceAnalyzer, KeywordAnalyzer
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.analysis.en import EnglishAnalyzer

if not lucene.getVMEnv():
    lucene.initVM()

# Settings
index_path = "/workspace/index"

FIELD_WEIGHTS = {
    'full_name': 3.25,
    'genre': 3.0,
    'keywords': 2.5,
    'publisher': 2.0,
    'description': 1.0,
    'platform': 1.5,
    'datePublished': 2.5,
    'metascore': 0.8
}

TEXT_FIELDS = ["full_name","genre","datePublished","keywords","publisher","description","platform","wiki_intro",
    "wiki_developer","wiki_producer","wiki_designer","wiki_programmer","wiki_artist",
    "wiki_writer","wiki_composer","wiki_series", "wiki_engine","wiki_platforms"]

ANALYZERS = {
    "standard": StandardAnalyzer(),
    "english": EnglishAnalyzer(),
    "whitespace": WhitespaceAnalyzer(),
    "keyword": KeywordAnalyzer()
}


Nov 26, 2025 3:33:44 PM org.apache.lucene.internal.vectorization.PanamaVectorizationProvider <init>
INFO: Java vector incubator API enabled; uses preferredBitSize=256; FMA enabled


In [2]:

# Load CSV
df = pd.read_csv("/workspace/data/mydata.csv",delimiter="\t")  # adjust path

# ðŸ”¹ Clean previous index (safe)
if os.path.exists(index_path):
    try:
        shutil.rmtree(index_path)
    except PermissionError:
        # manually remove lock if necessary
        lock_file = os.path.join(index_path, "write.lock")
        if os.path.exists(lock_file):
            os.remove(lock_file)
        shutil.rmtree(index_path)


In [None]:

# Choosing specific analyzer
chosen_analyzer = ANALYZERS["standard"]  # try: "standard", "whitespace", "keyword"

# Creating the indexer
indexer = engine.Indexer(index_path, analyzer=chosen_analyzer)

# Define schema
for field in TEXT_FIELDS:
    indexer.set(field, engine.Field.Text, stored=True)

indexer.set("metascore", dimensions=1, stored=True)
indexer.set("wiki_release_date", engine.DateTimeField, stored=True)


<DateTimeField: stored,pointDimensionCount=1,pointIndexDimensionCount=1,pointNumBytes=8>

In [4]:
# Add documents to index with field-level boosting
for i, row in df.iterrows():
    doc = {}
    for field in TEXT_FIELDS:
        value = str(row.get(field, "")).strip()
        if value:
            # Apply field-level boost using tuple (value, boost)
            doc[field] = (value, FIELD_WEIGHTS.get(field, 1.0))

    # NUMERIC FIELD
    try:
        doc["metascore"] = int(row["metascore"])
    except:
        pass
        
    # Add document (Lupyne automatically uses per-field boost)
    indexer.add(doc)

indexer.commit()
indexer.close()

In [5]:

# Build query using Lupyne
Q = engine.Query
query = Q.term("full_name", "witcher") & Q.phrase("description", "wild", "hunt")


In [18]:

# IndexSearcher
searcher = engine.indexers.IndexSearcher(index_path, analyzer=chosen_analyzer)

# TF-IDF
searcher.setSimilarity(ClassicSimilarity())

# BM25
# searcher.setSimilarity(BM25Similarity())


# Ask user for input
user_input = input("Enter your search query: ").strip()
keywords = user_input.split()
print("Searching for:", keywords)

all_fuzzy_hits = []

for term in keywords:          # loop through each keyword
    for field in TEXT_FIELDS:  # try it on all fields
        q = Q.fuzzy(field, term)
        hits = searcher.search(q)[:10]  # top 10 per term-field
        all_fuzzy_hits.extend(hits)

# Remove duplicates based on internal doc id (__id__) and sort by score descending
seen_ids = set()
best_fuzzy_hits = []
for hit in sorted(all_fuzzy_hits, key=lambda h: h.score, reverse=True):
    if hit.id not in seen_ids:
        best_fuzzy_hits.append(hit)
        seen_ids.add(hit.id)

# Boolean query keywords
bool_query = Q.term("full_name",keywords[0]);
for term in keywords:
    for text_field in TEXT_FIELDS:
        bool_query = bool_query | Q.term(text_field, term)
        
bool_hits = searcher.search(bool_query)[:10]

# Display results
print("\n=== Boolean Query: OR terms ===")
if not bool_hits:
    print("No results found")
for hit in bool_hits:
    print(hit["full_name"], "|", hit["publisher"], "|", hit["wiki_developer"],  "| score:", hit.score)
# Display scored hits
print("\n=== Best Fuzzy Hits Across All Fields ===")
if not best_fuzzy_hits:
    print("No results found")
for hit in best_fuzzy_hits[:10]:
    print(hit["full_name"], "|", hit["publisher"], "|", hit["wiki_developer"],  "| score:", hit.score)

Enter your search query:   platformer pixel art retro


Searching for: ['platformer', 'pixel', 'art', 'retro']

=== Boolean Query: OR terms ===
Monster Run. Free pixel-art platformer | Forsbit | nan | score: 9.807929992675781
The Grandmaster | ['PC', 'PC'] | 71 | score: 4.720178127288818
Mazecraft | Liger Games | nan | score: 3.9504916667938232
Sheepy | Eksperimental Games | nan | score: 3.40020751953125
Owlboy | D-Pad Studio | D-Pad Studio | score: 3.3485114574432373
They Bleed Pixels | Spooky Squid Games | nan | score: 3.1163418292999268
Commander Keen | id Software | nan | score: 2.764359951019287
Lorn&#x27;s Lure | Rubeki | nan | score: 2.499866247177124
Captain Kaon | Engage Pixel | nan | score: 2.4012911319732666
Soosiz | Ville Makynen | Touch Foo | score: 2.3547203540802

=== Best Fuzzy Hits Across All Fields ===
Bust a Groove 2 | Enix | Metro | score: 5.1197710037231445
Bust a Groove | 989 Studios | Metro | score: 5.1197710037231445
Heavenly Sword | Sony Computer Entertainment | Ninja Theory | score: 5.109486103057861
Enslaved: Odys