In [None]:
# Imports
import os
import shutil
import pandas as pd
from lupyne import engine
import lucene

from org.apache.lucene.search.similarities import ClassicSimilarity, BM25Similarity
from org.apache.lucene.analysis.core import WhitespaceAnalyzer, KeywordAnalyzer
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.analysis.en import EnglishAnalyzer

# Start the JVM only once; re-running this cell must not call initVM() again.
if not lucene.getVMEnv():
    lucene.initVM()

# Settings
# Directory where the Lucene index is written (it is deleted and rebuilt by the indexing cells).
index_path = "/workspace/index"

# Analyzer instances selectable by name; the index-setup cell picks one via ANALYZERS[...].
ANALYZERS = {
    "standard": StandardAnalyzer(),
    "english": EnglishAnalyzer(),
    "whitespace": WhitespaceAnalyzer(),
    "keyword": KeywordAnalyzer()
}

# Per-field weights. In the code visible in this notebook they are read only by the
# indexing loop (FIELD_WEIGHTS.get(field, 1.0)); the search functions do not reference them.
# Fields missing from this mapping fall back to a weight of 1.0 there.
FIELD_WEIGHTS = {
    'full_name': 4.0,
    'genre': 3.0,
    'keywords': 2.5,
    'description': 1.5,
    'intro_text': 1.5,
    'publisher': 2.0,
    'platform': 1.5,
}

print("✓ Configuration loaded")

Nov 26, 2025 3:33:44 PM org.apache.lucene.internal.vectorization.PanamaVectorizationProvider <init>
INFO: Java vector incubator API enabled; uses preferredBitSize=256; FMA enabled


In [None]:
# Load CSV - auto-detect all columns
csv_path = "/workspace/data/mydata.csv"
# on_bad_lines='skip' silently drops malformed rows, so the record count printed
# below can be smaller than the number of lines in the file.
df = pd.read_csv(csv_path, delimiter=",", encoding="utf-8", on_bad_lines='skip')

# Get all column names dynamically; every later cell (schema, indexing, search)
# iterates over ALL_FIELDS instead of hard-coding column names.
ALL_FIELDS = list(df.columns)

print(f"Loaded {len(df)} records")
print(f"Columns ({len(ALL_FIELDS)}): {ALL_FIELDS}")

# Preview data
df.head(3)

In [None]:
# Clean previous index so every run rebuilds it from scratch.
if os.path.exists(index_path):
    try:
        shutil.rmtree(index_path)
    except PermissionError:
        # Retry once after removing a leftover "write.lock" file
        # (presumably held over from a previous indexer on this directory).
        lock_file = os.path.join(index_path, "write.lock")
        if os.path.exists(lock_file):
            os.remove(lock_file)
        shutil.rmtree(index_path)

# Choose analyzer (the same instance is reused later when the searcher is opened)
chosen_analyzer = ANALYZERS["standard"]

# Create indexer
indexer = engine.Indexer(index_path, analyzer=chosen_analyzer)

# URL fields are registered as engine.Field.String instead of engine.Field.Text.
# NOTE(review): the comments and printed messages in this cell describe these fields as
# "stored only, not indexed". lupyne documents Field.String as an indexed, untokenized
# field, which would make URLs exact-match searchable - confirm, and if truly
# stored-only fields are wanted, verify whether indexer.set(field, stored=True) is the
# right call.
URL_FIELDS = {'url', 'wiki_url'}

# Define schema: one field definition per CSV column
for field in ALL_FIELDS:
    if field in URL_FIELDS:
        # URL columns: String field, stored (see the review note above about indexing)
        indexer.set(field, engine.Field.String, stored=True)
    else:
        # Other fields: full-text searchable and stored
        indexer.set(field, engine.Field.Text, stored=True)

print(f"✓ Index schema configured with {len(ALL_FIELDS)} fields")
print(f"  - URL fields (stored only, not indexed): {URL_FIELDS}")

<DateTimeField: stored,pointDimensionCount=1,pointIndexDimensionCount=1,pointNumBytes=8>

In [None]:
# Index all documents - simple approach, no special parsing.
# Every non-empty cell of every row is converted to a stripped string and added
# to the document under its column name.
indexed_count = 0

# `i` is the DataFrame index label; the progress message below assumes it is an
# integer row position (true for the default index produced by read_csv).
for i, row in df.iterrows():
    doc = {}

    # Index ALL fields
    for field in ALL_FIELDS:
        value = row.get(field, "")
        # Skip NaN cells and cells that are empty after stripping whitespace
        if pd.notna(value) and str(value).strip():
            value = str(value).strip()

            if field in URL_FIELDS:
                # URL fields: plain string value, no weight attached
                doc[field] = value
            else:
                # Other fields: value paired with its configured weight.
                # NOTE(review): this passes a (value, boost) tuple as the field value.
                # Confirm how lupyne's Indexer.add interprets a tuple - it may treat it
                # as two separate values rather than a boosted one, and recent Lucene
                # versions no longer support index-time boosts.
                boost = FIELD_WEIGHTS.get(field, 1.0)
                doc[field] = (value, boost)

    # Index if document has any content
    if doc:
        indexer.add(doc)
        indexed_count += 1

    # Progress report every 10,000 rows
    if (i + 1) % 10000 == 0:
        print(f"  Processed {i + 1} rows...")

# Persist the index and release the writer; the search cells open their own searcher.
indexer.commit()
indexer.close()

print(f"\n✓ Indexing complete!")
print(f"  Total indexed: {indexed_count:,}")
print(f"  URL fields stored but not searchable")

In [None]:
# =============================================================================
# SEARCH ENGINE - Classic term and Boolean search
# =============================================================================
# Supported search types:
#   - Term search: individual words
#   - Boolean search: OR by default, AND via boolean_and=True
#     (AND/OR are not parsed as operators inside the query string)
#   - Field-specific: genre:rpg, publisher:electronic
#   - Range queries: metascore:>80, year:2019-2023
# =============================================================================

import re

# Short alias for lupyne's query factory; used below as Q.term(...) and Q.alldocs().
Q = engine.Query

def extract_year_from_string(value):
    """Return the first standalone four-digit year (19xx or 20xx) found in ``value``.

    ``value`` is converted with ``str()`` before searching. Falsy input
    (None, empty string, 0) and text without such a year yield None.
    """
    if value:
        found = re.search(r'\b(19\d{2}|20\d{2})\b', str(value))
        if found:
            return int(found.group(1))
    return None

def extract_number_from_string(value):
    """Extract an integer from ``value``.

    The value is converted to text and parsed as a number first (so "85.7"
    becomes 85). If that fails, the first run of digits in the text
    (optionally preceded by a minus sign) is used instead.

    Args:
        value: Any object; it is converted with ``str()``.

    Returns:
        The extracted int, or None when ``value`` is None, blank, or contains
        no digits (this includes "nan" and "inf").
    """
    # Only None and blank text count as "missing"; a numeric 0 is a valid value.
    if value is None:
        return None
    text = str(value).strip()
    if not text:
        return None

    try:
        return int(float(text))
    except (ValueError, OverflowError):
        # ValueError: not a number (or NaN); OverflowError: infinity.
        found = re.search(r'(-?\d+)', text)
        return int(found.group(1)) if found else None

def parse_range(value_str):
    """Parse a range expression into inclusive integer bounds.

    Supported forms (surrounding whitespace is ignored):
        '>=N'  -> (N, 999999)
        '<=N'  -> (-999999, N)
        '>N'   -> (N + 1, 999999)
        '<N'   -> (-999999, N - 1)
        'A-B'  -> (min(A, B), max(A, B)); a reversed range is normalised so it
                  can still match
        'N'    -> (N, N)

    Args:
        value_str: The expression; it is converted with ``str()``.

    Returns:
        A ``(min, max)`` tuple of ints, or ``(None, None)`` when the text is
        not a recognised integer range.
    """
    text = str(value_str).strip()

    def _as_int(piece):
        """Return ``int(piece)``, or None when ``piece`` is not an integer literal."""
        try:
            return int(piece)
        except ValueError:
            return None

    # Open-ended comparisons. Two-character operators come first so that
    # '>=5' is not read as '>' followed by '=5'.
    open_ended = (
        ('>=', lambda n: (n, 999999)),
        ('<=', lambda n: (-999999, n)),
        ('>', lambda n: (n + 1, 999999)),
        ('<', lambda n: (-999999, n - 1)),
    )
    for prefix, bounds in open_ended:
        if text.startswith(prefix):
            number = _as_int(text[len(prefix):])
            return bounds(number) if number is not None else (None, None)

    # 'A-B' range; a leading '-' means a negative single number, handled below.
    if '-' in text and not text.startswith('-'):
        parts = text.split('-')
        if len(parts) == 2:
            low, high = _as_int(parts[0]), _as_int(parts[1])
            if low is None or high is None:
                return None, None
            return (low, high) if low <= high else (high, low)

    number = _as_int(text)
    return (number, number) if number is not None else (None, None)

def parse_query(query_string):
    """
    Split a query string into text searches and numeric range filters.

    The string is split on whitespace. Each token is classified as:
      - plain keyword (no ':')      -> text search over all fields
      - field:value                 -> text search restricted to ``field``
      - field:range on a year or number field (e.g. metascore:>80,
        year:2015-2020)             -> range filter

    Returns:
        (text_queries, range_filters, filters_info) where
          text_queries  is a list of (field_or_None, lowercased_term),
          range_filters is a list of (lowercased_field, min, max, extractor_func),
          filters_info  is a list of human-readable descriptions of every
                        field-qualified token.
    """
    YEAR_FIELDS = ['year', 'datepublished', 'date', 'released']
    NUMBER_FIELDS = ['metascore', 'score']

    text_queries = []
    range_filters = []
    filters_info = []

    for token in query_string.strip().split():
        field, separator, value = token.partition(':')

        # Plain keyword: searched in every field.
        if not separator:
            text_queries.append((None, token.lower()))
            continue

        field_lower = field.lower()
        min_val, max_val = parse_range(value)

        # A token becomes a range filter only when its value parses as a range
        # AND its field is one of the known year/number fields.
        extractor = None
        if min_val is not None:
            if field_lower in YEAR_FIELDS:
                extractor = extract_year_from_string
            elif field_lower in NUMBER_FIELDS:
                extractor = extract_number_from_string

        if extractor is None:
            text_queries.append((field, value.lower()))
            filters_info.append(f"{field}: {value}")
        else:
            range_filters.append((field_lower, min_val, max_val, extractor))
            filters_info.append(f"{field}: {min_val}-{max_val}")

    return text_queries, range_filters, filters_info


def build_boolean_query(text_queries, use_and=False):
    """
    Build a Lucene BooleanQuery from individual term queries.

    Each entry of ``text_queries`` becomes one sub-query that ORs a term query
    over every relevant field; the sub-queries are then combined with AND or OR.

    Args:
        text_queries: [(field_or_none, term), ...]. A falsy field means "search
            every column in ALL_FIELDS"; otherwise the field name is matched
            case-insensitively against ALL_FIELDS and used verbatim when no
            column matches.
        use_and: True = AND (every term must match), False = OR (at least one).

    Returns:
        A Lucene BooleanQuery, or None when no clause was added.
    """
    from org.apache.lucene.search import BooleanQuery, BooleanClause

    # The combining operator is identical for every term, so resolve it once.
    occur = BooleanClause.Occur.MUST if use_and else BooleanClause.Occur.SHOULD

    builder = BooleanQuery.Builder()
    has_clauses = False

    for field, term in text_queries:
        if field:
            # Field-specific search
            matching_fields = [f for f in ALL_FIELDS if f.lower() == field.lower()]
            fields_to_search = matching_fields if matching_fields else [field]
        else:
            # Search all fields
            fields_to_search = ALL_FIELDS

        # One sub-query per term: the term OR-ed across all relevant fields.
        field_builder = BooleanQuery.Builder()
        for f in fields_to_search:
            try:
                # Term query built from the term text as given.
                # NOTE(review): no analyzer is applied in this function; matching
                # presumably relies on parse_query lowercasing the term - confirm.
                field_builder.add(Q.term(f, term), BooleanClause.Occur.SHOULD)
            except Exception:
                # Best effort: a field that cannot take this clause is skipped.
                # Unlike a bare `except`, this lets KeyboardInterrupt/SystemExit through.
                continue

        field_query = field_builder.build()
        if field_query.clauses():
            # Attach the per-term sub-query to the main query
            builder.add(field_query, occur)
            has_clauses = True

    return builder.build() if has_clauses else None


def execute_search(query_string, searcher, max_results=20, boolean_and=False):
    """
    Run a search supporting classic term search, boolean queries and range filters.

    The query string is split by parse_query into text terms and range filters.
    Text terms are sent to the searcher as one boolean query; range filters are
    applied afterwards in Python on the stored field values of each hit.

    Args:
        query_string: Query entered by the user
        searcher: Searcher object; its search(query, count=...) method is used
        max_results: Maximum number of results to return
        boolean_and: True = AND mode (all terms must match)
                     False = OR mode (default, at least one term)

    Returns:
        (hits, filters_info): hits sorted by descending score and cut to
        max_results, plus the human-readable filter descriptions from parse_query.
    """
    text_queries, range_filters, filters_info = parse_query(query_string)

    all_hits = []

    # Run the text search. Three times max_results are requested so that some
    # hits remain after range filtering.
    # NOTE(review): when range filters are present, matching documents ranked below
    # max_results * 3 are never considered.
    if text_queries:
        boolean_query = build_boolean_query(text_queries, use_and=boolean_and)
        if boolean_query:
            hits = searcher.search(boolean_query, count=max_results * 3)
            all_hits.extend(hits)

    # Only range filters (no text): start from all documents, capped at 50000
    if not text_queries and range_filters:
        all_hits = list(searcher.search(Q.alldocs(), count=50000))

    # Deduplicate by document id, keeping the highest-scoring hit
    seen = {}
    for hit in all_hits:
        doc_id = hit.id
        if doc_id not in seen or hit.score > seen[doc_id].score:
            seen[doc_id] = hit

    # Apply range filters (post-search filtering); a hit must pass every filter
    filtered_hits = []
    for hit in seen.values():
        include = True

        for field_name, min_val, max_val, extractor in range_filters:
            # Find the first column with a non-empty value whose name matches the
            # filter field case-insensitively. The 'year' filter additionally
            # accepts any column whose name contains 'date'.
            value = None
            for f in ALL_FIELDS:
                if f.lower() == field_name.lower() or (field_name == 'year' and 'date' in f.lower()):
                    value = hit.get(f)
                    if value:
                        break

            if value:
                extracted = extractor(value)
                if extracted is None or not (min_val <= extracted <= max_val):
                    include = False
                    break
            else:
                # A missing or empty value fails the filter
                include = False
                break

        if include:
            filtered_hits.append(hit)

    # Sort by score, descending
    filtered_hits.sort(key=lambda h: h.score, reverse=True)

    return filtered_hits[:max_results], filters_info


# Confirm the search functions are defined and print a quick reference of the
# query syntax accepted by parse_query / execute_search.
print("✓ Search engine ready (Classic Term + Boolean Search)")
print("""
Query syntax:
  • Keywords (OR):      witcher rpg        - aspoň jeden term
  • Keywords (AND):     použite boolean_and=True v execute_search
  • Field search:       genre:rpg  publisher:electronic  platform:pc
  • Metascore range:    metascore:>80  metascore:70-90  metascore:<50
  • Year range:         year:2019  year:2015-2020  year:>2018
  • Combined:           witcher genre:rpg year:>2015 metascore:>70

Typ vyhľadávania:
  • Term Search - presná zhoda tokenov po analýze (StandardAnalyzer)
  • Boolean OR  - default, dokument matchuje ak obsahuje aspoň jeden term
  • Boolean AND - všetky termy musia byť v dokumente
""")

In [None]:
# =============================================================================
# INTERACTIVE SEARCH
# =============================================================================
# NOTE(review): this cell blocks on input(), so an unattended "Run All" stops here
# until a query is typed.

# Open a searcher on the index built above, using the same analyzer.
searcher = engine.indexers.IndexSearcher(index_path, analyzer=chosen_analyzer)

# Choose the similarity (TF-IDF or BM25)
searcher.setSimilarity(ClassicSimilarity())  # Classic TF-IDF
# searcher.setSimilarity(BM25Similarity())   # Alternative: BM25

print("="*80)
print("GAME SEARCH ENGINE - Classic Term & Boolean Search")
print("="*80)
print("""
Query examples:
  • witcher                           - Single keyword
  • apex legends shooter              - Multiple keywords (OR mode)
  • genre:action                      - Field-specific search
  • metascore:>80                     - Score greater than 80
  • year:2019-2023                    - Games from 2019-2023
  • witcher genre:rpg metascore:>70   - Combined query

Search type: Classic Term Search + Boolean OR (default)
""")
print("="*80)

# Read the query from the user
user_input = input("Enter search query: ").strip()

if user_input:
    # boolean_and=False = OR mode (default)
    # boolean_and=True  = AND mode (all terms must match)
    hits, filters_info = execute_search(user_input, searcher, max_results=15, boolean_and=False)

    print(f"\nSearch: '{user_input}'")
    if filters_info:
        print(f"Filters: {', '.join(filters_info)}")
    print(f"Results: {len(hits)}")
    print("-"*80)

    if not hits:
        print("No results found. Try broader search terms or check spelling.")
    else:
        for i, hit in enumerate(hits, 1):
            # Stored fields of the hit; placeholders are used for missing columns
            name = hit.get("full_name", "Unknown")
            publisher = hit.get("publisher", "-")
            genre = hit.get("genre", "-")
            metascore = hit.get("metascore", "-")
            date = hit.get("datePublished", "-")
            url = hit.get("url") or hit.get("wiki_url") or "-"

            # Shorten long values so each result stays compact
            if len(str(genre)) > 35:
                genre = str(genre)[:35] + "..."
            if len(str(publisher)) > 20:
                publisher = str(publisher)[:20] + "..."
            if len(str(date)) > 15:
                date = str(date)[:15] + "..."

            print(f"{i:2}. {name}")
            print(f"    Genre: {genre}")
            print(f"    Publisher: {publisher} | Date: {date} | Metascore: {metascore}")
            print(f"    Score: {hit.score:.2f}")
            if url != "-":
                # NOTE(review): "..." is appended even when the URL is shorter than
                # 60 characters, unlike the length-checked truncations above.
                print(f"    URL: {url[:60]}...")
            print()
else:
    print("No query entered.")

Enter your search query:   platformer pixel art retro


Searching for: ['platformer', 'pixel', 'art', 'retro']

=== Boolean Query: OR terms ===
Monster Run. Free pixel-art platformer | Forsbit | nan | score: 9.807929992675781
The Grandmaster | ['PC', 'PC'] | 71 | score: 4.720178127288818
Mazecraft | Liger Games | nan | score: 3.9504916667938232
Sheepy | Eksperimental Games | nan | score: 3.40020751953125
Owlboy | D-Pad Studio | D-Pad Studio | score: 3.3485114574432373
They Bleed Pixels | Spooky Squid Games | nan | score: 3.1163418292999268
Commander Keen | id Software | nan | score: 2.764359951019287
Lorn&#x27;s Lure | Rubeki | nan | score: 2.499866247177124
Captain Kaon | Engage Pixel | nan | score: 2.4012911319732666
Soosiz | Ville Makynen | Touch Foo | score: 2.3547203540802

=== Best Fuzzy Hits Across All Fields ===
Bust a Groove 2 | Enix | Metro | score: 5.1197710037231445
Bust a Groove | 989 Studios | Metro | score: 5.1197710037231445
Heavenly Sword | Sony Computer Entertainment | Ninja Theory | score: 5.109486103057861
Enslaved: Odys

In [None]:
# =============================================================================
# DEMO VYHĽADÁVANIA - Classic Term & Boolean Search
# =============================================================================

def demo_search(query, searcher, limit=5, boolean_and=False):
    """Run one demo query through execute_search and print a short result list.

    Args:
        query: Query string in the syntax understood by execute_search.
        searcher: Searcher passed through to execute_search.
        limit: Maximum number of results to show.
        boolean_and: True combines terms with AND, False (default) with OR.
    """
    label = "OR"
    if boolean_and:
        label = "AND"

    print(f"\n{'='*60}")
    print(f"Query: {query} (Boolean {label})")
    print('='*60)

    found, active_filters = execute_search(
        query, searcher, max_results=limit, boolean_and=boolean_and
    )

    if active_filters:
        print(f"Filters: {', '.join(active_filters)}")
    print(f"Found: {len(found)} results\n")

    if not found:
        print("  No results found")
        return

    for rank, hit in enumerate(found, start=1):
        title = hit.get("full_name", "Unknown")
        genre_text = str(hit.get("genre", "-"))[:25]
        meta = hit.get("metascore", "-")
        published = str(hit.get("datePublished", "-"))[:12]
        print(f"  {rank}. {title}")
        print(f"     {genre_text} | {published} | score:{meta}")

# Run the demo searches
print("\n" + "="*60)
print("DEMO SEARCHES - Classic Term & Boolean")
print("="*60)

# OR mode (default) - at least one term must match
demo_search("witcher", searcher)
demo_search("genre:shooter", searcher)
demo_search("metascore:>80", searcher)
demo_search("year:2019-2021", searcher)
demo_search("publisher:electronic genre:shooter", searcher)

# AND mode - all terms must match
demo_search("apex legends", searcher, boolean_and=True)

# Comparison of OR vs AND on the same query
print("\n" + "="*60)
print("POROVNANIE: OR vs AND mode")
print("="*60)
demo_search("dark souls", searcher, boolean_and=False)  # OR - finds dark OR souls
demo_search("dark souls", searcher, boolean_and=True)   # AND - finds dark AND souls

In [None]:
# =============================================================================
# EVALUATION METRICS - Hodnotenie úspešnosti IR systému
# =============================================================================
# Detailný popis všetkých metrík sa nachádza v DOKUMENTACIA.md (sekcia 5.1)
# =============================================================================

import numpy as np

# -----------------------------------------------------------------------------
# 1. SET-BASED METRIKY (nezohľadňujú poradie)
# -----------------------------------------------------------------------------

def calculate_precision(retrieved_set, relevant_set):
    """Precision = |Retrieved ∩ Relevant| / |Retrieved|.

    Both arguments are sets. Returns 0.0 when nothing was retrieved.
    """
    if retrieved_set:
        true_positives = retrieved_set & relevant_set
        return len(true_positives) / len(retrieved_set)
    return 0.0


def calculate_recall(retrieved_set, relevant_set):
    """Recall = |Retrieved ∩ Relevant| / |Relevant|.

    Both arguments are sets. Returns 0.0 when there are no relevant documents.
    """
    if relevant_set:
        true_positives = retrieved_set & relevant_set
        return len(true_positives) / len(relevant_set)
    return 0.0


def calculate_f1(precision, recall):
    """F1 = 2 * (Precision * Recall) / (Precision + Recall).

    Returns 0.0 when precision and recall sum to zero.
    """
    denominator = precision + recall
    if denominator == 0:
        return 0.0
    numerator = 2 * (precision * recall)
    return numerator / denominator


def calculate_accuracy(retrieved_set, relevant_set, total_docs):
    """Accuracy = (TP + TN) / Total.

    TN is derived as ``total_docs`` minus TP, FP and FN.
    Returns 0.0 when ``total_docs`` is not positive.
    """
    true_pos = len(retrieved_set & relevant_set)
    false_pos = len(retrieved_set - relevant_set)
    false_neg = len(relevant_set - retrieved_set)
    true_neg = total_docs - true_pos - false_pos - false_neg

    if total_docs > 0:
        return (true_pos + true_neg) / total_docs
    return 0.0


# -----------------------------------------------------------------------------
# 2. RANK-AWARE METRIKY (zohľadňujú poradie)
# -----------------------------------------------------------------------------

def calculate_precision_at_k(ranked_results, relevant_set, k):
    """P@K = |Retrieved[:K] ∩ Relevant| / K.

    The first ``k`` results are turned into a set, so a document that appears
    several times in the top K is counted once. The divisor is always ``k``,
    even when fewer than ``k`` results exist. Returns 0.0 for ``k == 0``.
    """
    if k != 0:
        head = set(ranked_results[:k])
        matched = head & relevant_set
        return len(matched) / k
    return 0.0


def calculate_average_precision(ranked_results, relevant_set):
    """AP = (1/|R|) * Σ P(k) * rel(k).

    Walks the ranking from the top; at every position holding a relevant
    document, the precision at that position is added, and the sum is divided
    by the number of relevant documents.

    A relevant document is credited only the first time it appears. Repeated
    entries still occupy a rank position but add nothing, which keeps the
    result within [0, 1] even when the ranking contains duplicates.

    Args:
        ranked_results: Retrieved document identifiers in rank order.
        relevant_set: Set of relevant document identifiers.

    Returns:
        Average precision as a float; 0.0 when ``relevant_set`` is empty.
    """
    if not relevant_set:
        return 0.0

    credited = set()
    precision_sum = 0.0

    for rank, doc in enumerate(ranked_results, start=1):
        if doc in relevant_set and doc not in credited:
            credited.add(doc)
            # Precision at this rank: distinct relevant docs found so far / rank
            precision_sum += len(credited) / rank

    return precision_sum / len(relevant_set)


def calculate_dcg(ranked_results, relevant_set, k=None):
    """DCG@K = Σ rel(i) / log₂(i + 1), with binary relevance and 1-based rank i.

    A relevant document contributes only at its first occurrence. Repeated
    entries keep their rank position but add no gain, so DCG cannot exceed
    the ideal DCG when the ranking contains duplicates.

    Args:
        ranked_results: Retrieved document identifiers in rank order.
        relevant_set: Set of relevant document identifiers.
        k: Cut-off rank; defaults to the full length of ``ranked_results``.

    Returns:
        The discounted cumulative gain.
    """
    if k is None:
        k = len(ranked_results)

    credited = set()
    dcg = 0.0
    for i, doc in enumerate(ranked_results[:k]):
        if doc in relevant_set and doc not in credited:
            credited.add(doc)
            dcg += 1.0 / np.log2(i + 2)  # i+2 because i is 0-indexed

    return dcg


def calculate_idcg(relevant_set, k=None):
    """IDCG@K - the ideal DCG, i.e. every relevant document ranked first.

    ``k`` defaults to the number of relevant documents; at most
    ``min(k, len(relevant_set))`` positions contribute.
    """
    cutoff = len(relevant_set) if k is None else k
    ideal_hits = min(cutoff, len(relevant_set))

    # Same discount as DCG: position p (0-based) contributes 1 / log2(p + 2).
    return sum((1.0 / np.log2(position + 2) for position in range(ideal_hits)), 0.0)


def calculate_ndcg(ranked_results, relevant_set, k=None):
    """NDCG@K = DCG@K / IDCG@K.

    ``k`` defaults to the length of ``ranked_results``. Returns 0.0 when the
    ideal DCG is zero.
    """
    cutoff = len(ranked_results) if k is None else k

    actual_gain = calculate_dcg(ranked_results, relevant_set, cutoff)
    ideal_gain = calculate_idcg(relevant_set, cutoff)

    return 0.0 if ideal_gain == 0 else actual_gain / ideal_gain


# -----------------------------------------------------------------------------
# 3. POMOCNÉ FUNKCIE
# -----------------------------------------------------------------------------

def get_confusion_matrix(retrieved_set, relevant_set, total_docs):
    """Return the confusion-matrix counts as a tuple (TP, FP, FN, TN).

    TN is whatever remains of ``total_docs`` after TP, FP and FN are removed.
    """
    true_pos = len(retrieved_set & relevant_set)
    false_pos = len(retrieved_set - relevant_set)
    false_neg = len(relevant_set - retrieved_set)
    true_neg = total_docs - true_pos - false_pos - false_neg
    return (true_pos, false_pos, false_neg, true_neg)


def print_confusion_matrix(tp, fp, fn, tn):
    """Print the confusion matrix as a boxed table.

    Rows are the actual class, columns the predicted class: the first data row
    shows TP and FN, the second FP and TN. Counts are right-aligned, width 4.
    """
    table_rows = [
        "\n┌─────────────────────────────────────────┐",
        "│           CONFUSION MATRIX              │",
        "├─────────────────────────────────────────┤",
        "│                  Predicted              │",
        "│              Relevant  Not-Rel          │",
        f"│ Actual Rel     {tp:4d}     {fn:4d}            │",
        f"│        Not-Rel {fp:4d}     {tn:4d}            │",
        "└─────────────────────────────────────────┘",
    ]
    for table_row in table_rows:
        print(table_row)


# -----------------------------------------------------------------------------
# 4. HLAVNÁ EVALUAČNÁ FUNKCIA
# -----------------------------------------------------------------------------

def evaluate_query(query_string, searcher, relevant_docs, total_docs, max_results=20):
    """
    Evaluate a single query against its list of relevant documents.

    Documents are identified by their "full_name" field, lowercased and
    stripped, on both the retrieved and the relevant side.

    Returns a dict with the query, the retrieved names in rank order, the
    relevant list as given, the set-based metrics (precision, recall, f1,
    accuracy), the rank-aware metrics (p_at_5, p_at_10, ap, ndcg_5, ndcg_10)
    and the confusion-matrix counts (tp, fp, fn, tn).
    """
    # Run the search; the filter descriptions are not needed here.
    hits, _ = execute_search(query_string, searcher, max_results=max_results)

    # Names in rank order (for rank-aware metrics) and as a set (for set metrics)
    retrieved_names = []
    for hit in hits:
        retrieved_names.append(hit.get("full_name", "").lower().strip())
    retrieved_set = set(retrieved_names)

    # Relevant names normalised the same way
    relevant_set = {doc.lower().strip() for doc in relevant_docs}

    precision = calculate_precision(retrieved_set, relevant_set)
    recall = calculate_recall(retrieved_set, relevant_set)
    tp, fp, fn, tn = get_confusion_matrix(retrieved_set, relevant_set, total_docs)

    return {
        'query': query_string,
        'retrieved': retrieved_names,
        'relevant': relevant_docs,
        # Set-based
        'precision': precision,
        'recall': recall,
        'f1': calculate_f1(precision, recall),
        'accuracy': calculate_accuracy(retrieved_set, relevant_set, total_docs),
        # Rank-aware
        'p_at_5': calculate_precision_at_k(retrieved_names, relevant_set, 5),
        'p_at_10': calculate_precision_at_k(retrieved_names, relevant_set, 10),
        'ap': calculate_average_precision(retrieved_names, relevant_set),
        'ndcg_5': calculate_ndcg(retrieved_names, relevant_set, k=5),
        'ndcg_10': calculate_ndcg(retrieved_names, relevant_set, k=10),
        # Confusion matrix
        'tp': tp, 'fp': fp, 'fn': fn, 'tn': tn
    }


# Confirm that the metric functions above are defined in the kernel.
print("✓ Evaluation functions loaded")
print("  Detailný popis metrík: viď DOKUMENTACIA.md (sekcia 5.1)")

In [None]:
# =============================================================================
# GROUND TRUTH DATASET - Manually defined relevant documents
# =============================================================================
# This data serves as the "gold standard" for the evaluation.
# The relevant games were selected based on the results expected from RAWG.io.

GROUND_TRUTH = {
    # Keyword searches
    "witcher": [
        "The Witcher 3: Wild Hunt",
        "The Witcher 2: Assassins of Kings",
        "The Witcher",
        "The Witcher 3: Wild Hunt - Blood and Wine",
        "The Witcher 3: Wild Hunt - Hearts of Stone",
        "Thronebreaker: The Witcher Tales",
        "The Witcher Adventure Game",
    ],
    
    "dark souls": [
        "Dark Souls",
        "Dark Souls II",
        "Dark Souls III",
        "Dark Souls: Remastered",
        "Dark Souls II: Scholar of the First Sin",
    ],
    
    "grand theft auto": [
        "Grand Theft Auto V",
        "Grand Theft Auto IV",
        "Grand Theft Auto: San Andreas",
        "Grand Theft Auto: Vice City",
        "Grand Theft Auto III",
        "Grand Theft Auto: Vice City Stories",
        "Grand Theft Auto: Liberty City Stories",
    ],
    
    "fifa": [
        "FIFA 23",
        "FIFA 22",
        "FIFA 21",
        "FIFA 20",
        "FIFA 19",
        "EA Sports FC 24",
    ],
    
    "assassin creed": [
        "Assassin's Creed",
        "Assassin's Creed II",
        "Assassin's Creed: Brotherhood",
        "Assassin's Creed: Revelations",
        "Assassin's Creed III",
        "Assassin's Creed IV: Black Flag",
        "Assassin's Creed: Unity",
        "Assassin's Creed: Syndicate",
        "Assassin's Creed: Origins",
        "Assassin's Creed: Odyssey",
        "Assassin's Creed: Valhalla",
        "Assassin's Creed Mirage",
    ],
    
    "call of duty": [
        "Call of Duty: Modern Warfare",
        "Call of Duty: Warzone",
        "Call of Duty: Black Ops Cold War",
        "Call of Duty: Vanguard",
        "Call of Duty: Modern Warfare II",
        "Call of Duty: Black Ops",
        "Call of Duty 4: Modern Warfare",
    ],
    
    "minecraft": [
        "Minecraft",
        "Minecraft: Story Mode",
        "Minecraft Dungeons",
        "Minecraft Legends",
    ],
    
    "apex legends": [
        "Apex Legends",
    ],
    
    "fortnite": [
        "Fortnite",
        "Fortnite Battle Royale",
    ],
    
    "cyberpunk": [
        "Cyberpunk 2077",
        "Cyberpunk 2077: Phantom Liberty",
    ],
    
    # Field-specific searches
    "genre:shooter": [
        "Call of Duty: Modern Warfare",
        "Apex Legends",
        "Counter-Strike: Global Offensive",
        "Valorant",
        "Overwatch",
        "Battlefield 2042",
        "Destiny 2",
        "Halo Infinite",
    ],
    
    "publisher:electronic": [
        "FIFA 23",
        "Apex Legends",
        "Battlefield 2042",
        "Need for Speed Heat",
        "Star Wars Jedi: Survivor",
        "EA Sports FC 24",
        "Mass Effect Legendary Edition",
    ],
    
    "publisher:2k": [
        "BioShock",
        "BioShock Infinite",
        "Borderlands 3",
        "NBA 2K24",
        "WWE 2K23",
        "Civilization VI",
        "XCOM 2",
    ],
}

# RAWG.io comparison results (up to 5 top results per query; some queries have fewer)
RAWG_RESULTS = {
    "witcher": [
        "Gwent: The Witcher Card Game",
        "The Witcher", 
        "Thronebreaker: The Witcher Tales",
        "The Witcher (A New Saga Begins)",
        "The Witcher: Enhanced Edition Director's Cut",
    ],
    "dark souls": [
        "Dark Souls",
        "Dark Souls: Remastered",
        "Dark Souls: Artorias of the Abyss",
        "Dark Souls: Prepare To Die Edition",
        "Dark Souls III: Ashes of Ariandel",
    ],
    "apex legends": [
        "Ape Out",
        "Ape Escape",
        "Ape-Man",
    ],
    "minecraft": [
        "Minecraft",
        "Minecraft Dungeons",
        "Minecraft: Pocket Edition",
        "Minecraft: Story Mode",
    ],
}


# Report how many queries each reference dictionary contains.
print(f"✓ Ground truth loaded: {len(GROUND_TRUTH)} queries")
print(f"✓ RAWG comparison data: {len(RAWG_RESULTS)} queries")

In [None]:
# =============================================================================
# EVALUATION RUNNER - Spustenie evaluácie na ground truth datasete
# =============================================================================

def run_evaluation(ground_truth, searcher, total_docs, verbose=True):
    """
    Run the evaluation over every query in the ground-truth dataset.

    Args:
        ground_truth: Dict {query: [relevant_docs]}
        searcher: Searcher passed through to evaluate_query
        total_docs: Total number of documents in the index
        verbose: Whether to print per-query and summary results

    Returns:
        results: List with one result dict per query (see evaluate_query)
        summary: Dict of aggregated metrics: num_queries, avg_precision,
            avg_recall, avg_f1, avg_ap, avg_ndcg_5, avg_ndcg_10. Every average
            is 0.0 when ``ground_truth`` is empty.
    """
    def _mean_or_zero(values):
        """Mean of ``values``; 0.0 for an empty list (np.mean would give nan and warn)."""
        return np.mean(values) if values else 0.0

    results = []

    for query, relevant_docs in ground_truth.items():
        result = evaluate_query(query, searcher, relevant_docs, total_docs)
        results.append(result)

        if verbose:
            print(f"\n{'='*60}")
            print(f"Query: '{query}'")
            print(f"{'='*60}")
            print(f"  Relevantné dokumenty: {len(relevant_docs)}")
            print(f"  Vrátené dokumenty: {len(result['retrieved'])}")
            print(f"  True Positives: {result['tp']}")
            print()
            print("  SET-BASED METRIKY:")
            print(f"    Precision: {result['precision']:.3f}")
            print(f"    Recall:    {result['recall']:.3f}")
            print(f"    F1-Score:  {result['f1']:.3f}")
            print()
            print("  RANK-AWARE METRIKY:")
            print(f"    P@5:       {result['p_at_5']:.3f}")
            print(f"    P@10:      {result['p_at_10']:.3f}")
            print(f"    AP:        {result['ap']:.3f}")
            print(f"    NDCG@5:    {result['ndcg_5']:.3f}")
            print(f"    NDCG@10:   {result['ndcg_10']:.3f}")

    # Aggregated metrics: all averages share the same empty-input guard.
    summary = {
        'num_queries': len(results),
        'avg_precision': _mean_or_zero([r['precision'] for r in results]),
        'avg_recall': _mean_or_zero([r['recall'] for r in results]),
        'avg_f1': _mean_or_zero([r['f1'] for r in results]),
        'avg_ap': _mean_or_zero([r['ap'] for r in results]),
        'avg_ndcg_5': _mean_or_zero([r['ndcg_5'] for r in results]),
        'avg_ndcg_10': _mean_or_zero([r['ndcg_10'] for r in results]),
    }

    if verbose:
        print(f"\n{'='*60}")
        print("CELKOVÉ VÝSLEDKY (SÚHRN)")
        print(f"{'='*60}")
        print(f"  Počet dotazov: {summary['num_queries']}")
        print()
        print("  AGREGOVANÉ SET-BASED METRIKY:")
        print(f"    Avg Precision:  {summary['avg_precision']:.3f}")
        print(f"    Avg Recall:     {summary['avg_recall']:.3f}")
        print(f"    Avg F1-Score:   {summary['avg_f1']:.3f}")
        print()
        print("  AGREGOVANÉ RANK-AWARE METRIKY:")
        print(f"    Avg AP:         {summary['avg_ap']:.3f}")
        print(f"    Avg NDCG@5:     {summary['avg_ndcg_5']:.3f}")
        print(f"    Avg NDCG@10:    {summary['avg_ndcg_10']:.3f}")
        print()
        print("  INTERPRETÁCIA NDCG:")
        # The verdict is based on the average NDCG@10.
        avg_ndcg10 = summary['avg_ndcg_10']
        if avg_ndcg10 >= 0.8:
            print("    ✓ NDCG > 0.80: Vynikajúci ranking výsledkov")
        elif avg_ndcg10 >= 0.6:
            print("    ○ NDCG 0.60-0.80: Dobrý ranking výsledkov")
        else:
            print("    ✗ NDCG < 0.60: Ranking potrebuje zlepšenie")

    return results, summary

# Confirm that run_evaluation is defined in the kernel.
print("✓ Evaluation runner loaded")

In [None]:
# =============================================================================
# SPUSTENIE EVALUÁCIE
# =============================================================================

# Number of documents in the index; used to derive true negatives and accuracy.
total_docs = searcher.count()
print(f"Celkový počet dokumentov v indexe: {total_docs}\n")

# Run the evaluation over every ground-truth query, printing per-query details
results, summary = run_evaluation(
    ground_truth=GROUND_TRUTH,
    searcher=searcher,
    total_docs=total_docs,
    verbose=True
)

In [None]:
# =============================================================================
# VIZUALIZÁCIA VÝSLEDKOV EVALUÁCIE
# =============================================================================

import matplotlib.pyplot as plt

def _draw_grouped_bars(ax, series, tick_labels, width, title, ylabel='Score', ymax=1.1):
    """Draw one per-query grouped bar chart on ``ax``.

    Args:
        ax: Matplotlib axes to draw on.
        series: Sequence of ``(legend_label, heights, color)`` tuples, one per
            bar inside each group; every ``heights`` holds one value per query.
        tick_labels: X tick label for each query.
        width: Width of a single bar.
        title: Axes title.
        ylabel: Y axis label.
        ymax: Upper y limit (the lower limit is always 0).
    """
    positions = range(len(tick_labels))
    # Centre each group on its integer tick:
    # 3 bars -> shifts of -w, 0, +w; 2 bars -> shifts of -w/2, +w/2.
    middle = (len(series) - 1) / 2
    for slot, (legend_label, heights, color) in enumerate(series):
        shift = (slot - middle) * width
        ax.bar([pos + shift for pos in positions], heights, width,
               label=legend_label, color=color)
    ax.set_xlabel('Query')
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    ax.set_xticks(positions)
    ax.set_xticklabels(tick_labels, rotation=45, ha='right', fontsize=8)
    ax.legend()
    ax.set_ylim(0, ymax)
    ax.grid(axis='y', alpha=0.3)


def visualize_evaluation(results, summary, output_path='/workspace/data/evaluation_results.png'):
    """Plot the evaluation results as a 2x2 grid of bar charts.

    Panels: per-query Precision/Recall/F1, per-query AP/NDCG@5/NDCG@10,
    the averaged metrics from ``summary``, and per-query P@5 vs P@10.

    Args:
        results: List of per-query dicts. Reads the keys 'query', 'precision',
            'recall', 'f1', 'ap', 'ndcg_5', 'ndcg_10', 'p_at_5' and 'p_at_10'.
        summary: Dict of averaged metrics. Reads the keys 'avg_precision',
            'avg_recall', 'avg_f1', 'avg_ap', 'avg_ndcg_5' and 'avg_ndcg_10'.
        output_path: File the figure is written to before it is shown.
            Pass ``None`` (or an empty string) to skip saving.

    Returns:
        None. The figure is displayed with ``plt.show()``; it is deliberately
        not returned so a notebook cell ending in this call does not render
        the figure a second time.
    """
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))

    # Long query strings are cut to 25 characters so tick labels stay readable.
    queries = [r['query'][:25] + '...' if len(r['query']) > 25 else r['query'] for r in results]

    def per_query(key):
        """Collect one metric across all queries, in result order."""
        return [r[key] for r in results]

    # 1. Set-based metrics per query
    _draw_grouped_bars(
        axes[0, 0],
        [
            ('Precision', per_query('precision'), 'steelblue'),
            ('Recall', per_query('recall'), 'coral'),
            ('F1', per_query('f1'), 'seagreen'),
        ],
        queries,
        width=0.25,
        title='Set-based metriky (Precision, Recall, F1)',
    )

    # 2. Rank-aware metrics per query (AP, NDCG); this panel uses a taller y axis
    _draw_grouped_bars(
        axes[0, 1],
        [
            ('AP', per_query('ap'), 'mediumpurple'),
            ('NDCG@5', per_query('ndcg_5'), 'gold'),
            ('NDCG@10', per_query('ndcg_10'), 'orange'),
        ],
        queries,
        width=0.25,
        title='Rank-aware metriky (AP, NDCG)',
        ymax=1.2,
    )

    # 3. Aggregated metrics side by side
    ax3 = axes[1, 0]
    metrics = ['Precision', 'Recall', 'F1', 'AP', 'NDCG@5', 'NDCG@10']
    values = [
        summary['avg_precision'],
        summary['avg_recall'],
        summary['avg_f1'],
        summary['avg_ap'],
        summary['avg_ndcg_5'],
        summary['avg_ndcg_10'],
    ]
    colors = ['steelblue', 'coral', 'seagreen', 'mediumpurple', 'gold', 'orange']
    bars = ax3.bar(metrics, values, color=colors)
    ax3.set_ylabel('Score')
    ax3.set_title('Agregované metriky (priemer cez všetky dotazy)')
    ax3.set_ylim(0, 1.1)
    ax3.grid(axis='y', alpha=0.3)
    # Print each value just above its bar
    for bar, val in zip(bars, values):
        ax3.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.02,
                 f'{val:.2f}', ha='center', va='bottom', fontsize=9)

    # 4. P@K comparison
    _draw_grouped_bars(
        axes[1, 1],
        [
            ('P@5', per_query('p_at_5'), 'royalblue'),
            ('P@10', per_query('p_at_10'), 'lightblue'),
        ],
        queries,
        width=0.35,
        title='Precision at K (P@5 vs P@10)',
        ylabel='Precision@K',
    )

    fig.tight_layout()
    if output_path:
        fig.savefig(output_path, dpi=150, bbox_inches='tight')
    plt.show()

    if output_path:
        print(f"\n✓ Graf uložený do {output_path}")

# Render the evaluation charts from the results and summary computed above.
visualize_evaluation(results, summary)

In [None]:
# =============================================================================
# TABUĽKOVÝ SÚHRN VÝSLEDKOV
# =============================================================================

import pandas as pd

# Per-query table: one row per evaluated query, every metric rendered as a
# string with three decimals.
df_results = pd.DataFrame([
    {
        'Query': r['query'],
        'TP': r['tp'],
        **{
            column: f"{r[key]:.3f}"
            for column, key in (
                ('Precision', 'precision'),
                ('Recall', 'recall'),
                ('F1', 'f1'),
                ('AP', 'ap'),
                ('NDCG@5', 'ndcg_5'),
                ('NDCG@10', 'ndcg_10'),
                ('P@5', 'p_at_5'),
                ('P@10', 'p_at_10'),
            )
        },
    }
    for r in results
])

print("=" * 100, "DETAILNÉ VÝSLEDKY PRE VŠETKY DOTAZY", "=" * 100, sep="\n")
display(df_results)

# Summary table banner (preceded by a blank line)
print("\n" + "=" * 100, "SÚHRN AGREGOVANÝCH METRÍK", "=" * 100, sep="\n")

# The table is written row by row as (metric, value, description) and then
# transposed into the column-oriented dict that pandas receives. The two
# '--- ... ---' rows are section headers and therefore carry no value.
summary_data = dict(zip(
    ('Metrika', 'Hodnota', 'Popis'),
    map(list, zip(
        ('--- SET-BASED ---', '', 'Metriky porovnávajúce množiny (bez poradia)'),
        ('Avg Precision', f"{summary['avg_precision']:.4f}", 'Pomer relevantných z vrátených'),
        ('Avg Recall', f"{summary['avg_recall']:.4f}", 'Pomer nájdených z relevantných'),
        ('Avg F1-Score', f"{summary['avg_f1']:.4f}", 'Harmonický priemer P a R'),
        ('--- RANK-AWARE ---', '', 'Metriky zohľadňujúce poradie'),
        ('Avg AP (Average Precision)', f"{summary['avg_ap']:.4f}", 'Priemerná presnosť na relevantných pozíciách'),
        ('Avg NDCG@5', f"{summary['avg_ndcg_5']:.4f}", 'Normalizované DCG (top 5)'),
        ('Avg NDCG@10', f"{summary['avg_ndcg_10']:.4f}", 'Normalizované DCG (top 10)'),
    )),
))

df_summary = pd.DataFrame(summary_data)
display(df_summary)

# Persist the per-query table next to the input data
df_results.to_csv('/workspace/data/evaluation_results.csv', index=False)
print("\n✓ Výsledky uložené do /workspace/data/evaluation_results.csv")