In [7]:
!pip install gradio sentence-transformers faiss-cpu requests



In [9]:
import calendar
import json
import os
import re
import tempfile
import zipfile
from datetime import date, datetime, timezone
from pathlib import Path
from typing import Callable, Dict, List, Optional, Tuple

import faiss
import gradio as gr
import numpy as np
import requests  # for OpenRouter HTTP calls
from sentence_transformers import SentenceTransformer


# ===========================
# 0) OpenRouter + Claude helper
# ===========================

# Model identifier used when the caller does not supply one.
DEFAULT_MODEL_ID = "anthropic/claude-3.5-sonnet"
# OpenRouter chat-completions endpoint that ask() posts to.
OPENROUTER_URL = "https://openrouter.ai/api/v1/chat/completions"
# Relative path of the JSON file that stores the Q&A log.
LOG_PATH = "lkp_rag_log.json"


def ask(
    prompt: str,
    api_key: str,
    model_id: Optional[str] = None,
    temperature: float = 0.2,
) -> str:
    """
    Send a single user prompt to an LLM through OpenRouter and return the reply.

    Args:
        prompt: Text sent as the user message.
        api_key: OpenRouter API key; surrounding whitespace is ignored.
        model_id: Model identifier. DEFAULT_MODEL_ID is used when this is
            None or empty; otherwise the value passed in (e.g. from the GUI
            dropdown or a custom entry) is used.
        temperature: Sampling temperature forwarded to the API.

    Returns:
        The message content of the first choice in the response.

    Raises:
        RuntimeError: If the API key is missing or blank, or if the response
            body does not contain a completion.
        requests.HTTPError: If the endpoint answers with an HTTP error status.
    """
    # Treat None the same as an empty key instead of failing on .strip().
    api_key = (api_key or "").strip()
    if not api_key:
        raise RuntimeError("OpenRouter API key nav norādīts.")

    model_id = (model_id or DEFAULT_MODEL_ID).strip()

    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
    }

    data = {
        "model": model_id,
        "messages": [
            {
                "role": "system",
                "content": (
                    "Tu esi precīzs, kritisks vēsturnieks, kas analizē "
                    "Latvijas komunistisko pagrīdes organizāciju skrejlapas (1934–1940)."
                ),
            },
            {"role": "user", "content": prompt},
        ],
        "temperature": float(temperature),
    }

    resp = requests.post(OPENROUTER_URL, headers=headers, json=data, timeout=60)
    resp.raise_for_status()

    payload = resp.json()
    try:
        return payload["choices"][0]["message"]["content"]
    except (KeyError, IndexError, TypeError) as exc:
        # A successful HTTP status can still carry a body without "choices";
        # surface whatever error detail the body has instead of a bare KeyError.
        detail = payload.get("error") if isinstance(payload, dict) else payload
        raise RuntimeError(
            f"OpenRouter response contains no completion: {detail!r}"
        ) from exc


# ===========================
# 0b) Žurnāla (loga) palīgfunkcijas
# ===========================

def _serialize_filters_for_log(filters: Dict) -> Dict:
    """
    Padara filtrus JSON-serializējamus (datumi -> ISO string).
    """
    out = dict(filters)
    for key in ("date_from", "date_to"):
        v = out.get(key)
        if isinstance(v, date):
            out[key] = v.isoformat()
    return out


def _serialize_chunk_for_log(chunk: Dict, preview_chars: int = 400) -> Dict:
    """
    Izvelk tikai vajadzīgo informāciju no fragmenta + īsu teksta fragmentu.
    """
    text = chunk.get("text", "") or ""
    if len(text) > preview_chars:
        text_snippet = text[:preview_chars] + "..."
    else:
        text_snippet = text

    return {
        "leaflet_id": chunk.get("leaflet_id"),
        "file_name": chunk.get("file_name"),
        "title": chunk.get("title"),
        "date": chunk.get("date"),
        "print_run": chunk.get("print_run"),
        "author": chunk.get("author"),
        "source": chunk.get("source"),
        "chunk_id": chunk.get("chunk_id"),
        "score": chunk.get("score"),
        "text_snippet": text_snippet,
    }


def log_qa_event(
    question: str,
    answer: str,
    retrieved_chunks: List[Dict],
    filters: Dict,
    model_id: str,
    top_k: int,
    temperature: float,
    preview_chars_for_log: int = 400,
) -> None:
    """
    Append one Q&A session to the JSON log file LOG_PATH.

    The file holds a single JSON array: [ {event1}, {event2}, ... ].
    Logging is best-effort: any failure while reading or writing the file is
    reported with a printed warning and never raised to the caller.

    Args:
        question: The user's question.
        answer: The answer text to record.
        retrieved_chunks: Chunks shown for this answer; each is reduced with
            _serialize_chunk_for_log.
        filters: Active filters; made JSON-safe with _serialize_filters_for_log.
        model_id: Identifier of the model that produced the answer.
        top_k: Requested number of chunks.
        temperature: Sampling temperature used for the answer.
        preview_chars_for_log: Maximum length of each logged text snippet.
    """
    # Timezone-aware "now" (datetime.utcnow() is deprecated); the offset is
    # dropped again so the stored format stays "<naive ISO>Z" as before.
    now_utc = datetime.now(timezone.utc).replace(tzinfo=None)
    event = {
        "timestamp_utc": now_utc.isoformat() + "Z",
        "question": question,
        "answer": answer,
        "model_id": model_id,
        "top_k": int(top_k),
        "temperature": float(temperature),
        "filters": _serialize_filters_for_log(filters or {}),
        "retrieved_chunks": [
            _serialize_chunk_for_log(c, preview_chars_for_log)
            for c in (retrieved_chunks or [])
        ],
    }

    try:
        history = []
        if os.path.exists(LOG_PATH):
            try:
                with open(LOG_PATH, "r", encoding="utf-8") as f:
                    content = f.read().strip()
                history = json.loads(content) if content else []
            except (OSError, ValueError) as read_err:
                # Keep the unreadable file under another name instead of
                # silently overwriting it with a fresh log.
                backup_path = LOG_PATH + ".corrupt"
                os.replace(LOG_PATH, backup_path)
                print(
                    f"Warning: unreadable log moved to {backup_path}: {read_err}"
                )
                history = []

        if not isinstance(history, list):
            history = [history]

        history.append(event)

        # Serialise first, then write to a temporary file and swap it in, so
        # a failure at any step leaves the existing log untouched.
        serialized = json.dumps(history, ensure_ascii=False, indent=2)
        tmp_path = LOG_PATH + ".tmp"
        with open(tmp_path, "w", encoding="utf-8") as f:
            f.write(serialized)
        os.replace(tmp_path, LOG_PATH)

    except Exception as e:
        # Deliberately broad: logging must never break the Q&A flow.
        print(f"Warning: could not write log: {e}")


def get_log_file_for_gui():
    """
    Callback for the Gradio download button.

    Makes sure the log file exists (creating it with an empty JSON array when
    it does not) and returns LOG_PATH so the file can be downloaded.
    """
    log_is_missing = not os.path.exists(LOG_PATH)
    if log_is_missing:
        with open(LOG_PATH, "w", encoding="utf-8") as handle:
            json.dump([], handle, ensure_ascii=False, indent=2)
    return LOG_PATH


# ===========================
# 1) Palīgfunkcijas datumiem un tirāžai
# ===========================

def _parse_plain_date_to_range(s: str) -> Tuple[Optional[date], Optional[date]]:
    """
    '1934-08-20' -> (1934-08-20, 1934-08-20)
    '1934-08'    -> (1934-08-01, 1934-08-31)
    '1934'       -> (1934-01-01, 1934-12-31)
    Любой мусор -> (None, None)
    """
    s = (s or "").strip()
    if not s:
        return None, None

    parts = s.split("-")
    try:
        if len(parts) == 3:
            y, m, d = map(int, parts)
            dt = date(y, m, d)
            return dt, dt
        elif len(parts) == 2:
            y, m = map(int, parts)
            last_day = calendar.monthrange(y, m)[1]
            return date(y, m, 1), date(y, m, last_day)
        else:
            y = int(parts[0])
            return date(y, 1, 1), date(y, 12, 31)
    except Exception:
        return None, None


def parse_metadata_date_to_range(date_str: str) -> Tuple[Optional[date], Optional[date]]:
    """
    Convert a metadata date field into an inclusive (start, end) range.

    Supported forms:
    - 'YYYY-MM-DD', 'YYYY-MM', 'YYYY'   plain date (see _parse_plain_date_to_range)
    - '[YYYY-MM-DD...]'                 open-ended: (start, None)
    - '[...YYYY-MM-DD]'                 open start: (None, end)
    - '[YYYY-MM-DD..YYYY-MM-DD]'        closed range; '...' between the two
                                        dates is accepted as well
    - '[YYYY-MM-DD]'                    bracketed plain date

    A bound that cannot be parsed is returned as None; an empty input gives
    (None, None).
    """
    if not date_str:
        return None, None

    date_str = date_str.strip()

    if not (date_str.startswith("[") and date_str.endswith("]")):
        return _parse_plain_date_to_range(date_str)

    inner = date_str[1:-1].strip()

    # Open-ended forms: only one side of the range is given.
    if inner.endswith("..."):
        start, _ = _parse_plain_date_to_range(inner[:-3])
        return start, None
    if inner.startswith("..."):
        _, end = _parse_plain_date_to_range(inner[3:])
        return None, end

    # Closed range. Try the three-dot separator first so that 'A...B' is not
    # split as 'A' / '.B', which would lose the right-hand bound.
    for separator in ("...", ".."):
        if separator in inner:
            left, right = inner.split(separator, 1)
            start, _ = _parse_plain_date_to_range(left)
            _, end = _parse_plain_date_to_range(right)
            return start, end

    return _parse_plain_date_to_range(inner)


def parse_print_run_value(v: str) -> Optional[int]:
    """
    Extract a numeric print run from a free-text metadata value.

    '1000'          -> 1000
    'unk'           -> None
    'ap 5000'       -> 5000
    '5 000 eks.'    -> 5000   (space, no-break space, '.' or ',' as thousands separator)
    'ap 5000-6000'  -> 5000   (first number only; digits of separate numbers
                               are never glued together)

    Returns None when the value is empty, 'unk', or contains no number.
    """
    if not v:
        return None
    v_low = str(v).strip().lower()
    if v_low == "unk":
        return None

    # First alternative: 1-3 digits followed by groups of exactly three digits
    # with a thousands separator. Second alternative: a plain run of digits.
    match = re.search(r"\d{1,3}(?:[ \u00a0.,]\d{3})+(?!\d)|\d+", v_low)
    if match is None:
        return None

    digits = "".join(ch for ch in match.group() if ch.isdigit())
    try:
        return int(digits)
    except ValueError:
        return None


def parse_user_date_box(s: str, is_start: bool) -> Optional[date]:
    """
    Parse the GUI "date from" / "date to" box (YYYY, YYYY-MM or YYYY-MM-DD).

    A partial date is expanded to the first day of the period when *is_start*
    is true and to the last day otherwise. An empty box gives None; text that
    cannot be converted raises ValueError.
    """
    raw = (s or "").strip()
    if not raw:
        return None

    pieces = raw.split("-")
    piece_count = len(pieces)

    if piece_count == 3:
        year, month, day = (int(piece) for piece in pieces)
        return date(year, month, day)

    if piece_count == 2:
        year, month = (int(piece) for piece in pieces)
        day = 1 if is_start else calendar.monthrange(year, month)[1]
        return date(year, month, day)

    year = int(pieces[0])
    month, day = (1, 1) if is_start else (12, 31)
    return date(year, month, day)


# ===========================
# 1b) Loading and parsing leaflets from ZIP
# ===========================

def parse_metadata(content: str) -> Dict:
    """
    Parse one leaflet file into a metadata dictionary.

    The file is expected to consist of "key: value" header lines followed by a
    "text:" marker and the leaflet body. Recognised header keys are the ones
    pre-seeded in the result dict below; "id" is converted to int (None when
    it is not numeric) and unknown keys are ignored.

    Besides the raw fields, the result carries three derived entries:
    "date_start" / "date_end" (from parse_metadata_date_to_range) and
    "print_run_value" (from parse_print_run_value).
    """
    # Prefer a "text:" marker at the start of a line, so that a header value
    # that merely contains the substring (e.g. "context: ...") does not cut
    # the header short. Fall back to the first occurrence anywhere.
    marker = re.search(r"^[ \t]*text:", content, flags=re.MULTILINE)
    body_text: Optional[str]
    if marker is not None:
        metadata_text = content[: marker.start()]
        body_text = content[marker.end():]
    else:
        metadata_text, found, remainder = content.partition("text:")
        body_text = remainder if found else None

    metadata: Dict = {
        "id": None,
        "file_name": "",
        "title": "",
        "author": "",
        "date": "",
        "print_run": "",
        "typography_name": "",
        "source": "",
        "text": "",
    }

    for line in metadata_text.split("\n"):
        line = line.strip()
        if not line:
            continue

        key, has_colon, value = line.partition(":")
        if not has_colon:
            continue
        key = key.strip()
        value = value.strip()

        if key == "id":
            try:
                metadata[key] = int(value)
            except ValueError:
                metadata[key] = None
        elif key in metadata:
            metadata[key] = value

    if body_text is not None:
        metadata["text"] = body_text.strip()

    raw_date = metadata.get("date", "")
    try:
        d_start, d_end = parse_metadata_date_to_range(raw_date)
    except Exception as e:
        # A bad date must not make the whole leaflet unusable.
        print(
            f"Warning: cannot parse date '{raw_date}' "
            f"in file {metadata.get('file_name','')}: {e}"
        )
        d_start, d_end = None, None

    metadata["date_start"] = d_start
    metadata["date_end"] = d_end
    metadata["print_run_value"] = parse_print_run_value(metadata.get("print_run", ""))

    return metadata


def load_leaflets_from_zip(zip_path: str) -> List[Dict]:
    """
    Extract a ZIP archive and parse every leaflet .txt file found in it.

    The .txt files are looked for in the archive root first and, failing
    that, in the first immediate sub-directory (in name order) that contains
    any. Files are processed in name order so repeated runs yield the same
    sequence.

    Args:
        zip_path: Path to the ZIP archive.

    Returns:
        One dict per successfully parsed file (see parse_metadata), with
        "path" added and "file_name" / "title" defaulted from the file name
        when the header did not provide them. Note that "path" points into a
        temporary directory that is removed before this function returns.

    Raises:
        ValueError: If no directory with .txt files is found in the archive.
    """
    results: List[Dict] = []

    with tempfile.TemporaryDirectory() as temp_dir:
        with zipfile.ZipFile(zip_path, "r") as zip_ref:
            zip_ref.extractall(temp_dir)

        corpus_dir: Optional[str] = None
        if any(Path(temp_dir).glob("*.txt")):
            corpus_dir = temp_dir
        else:
            # sorted() makes the choice deterministic when several
            # sub-directories contain .txt files.
            for item in sorted(os.listdir(temp_dir)):
                potential_corpus_dir = os.path.join(temp_dir, item)
                if os.path.isdir(potential_corpus_dir) and any(
                    Path(potential_corpus_dir).glob("*.txt")
                ):
                    corpus_dir = potential_corpus_dir
                    break

        if not corpus_dir:
            raise ValueError("Cannot find corpus directory with .txt files in ZIP file")

        for file_path in sorted(Path(corpus_dir).glob("*.txt")):
            try:
                # "utf-8-sig" reads plain UTF-8 too, but drops a leading BOM
                # that would otherwise become part of the first header key.
                with open(file_path, "r", encoding="utf-8-sig") as f:
                    content = f.read()

                leaflet_data = parse_metadata(content)
                leaflet_data["path"] = str(file_path)
                if not leaflet_data.get("file_name"):
                    leaflet_data["file_name"] = file_path.name
                if not leaflet_data.get("title"):
                    leaflet_data["title"] = file_path.stem

                results.append(leaflet_data)

            except Exception as e:
                # Best-effort: one unreadable file must not abort the import.
                print(f"Error processing {file_path}: {e}")

    return results


# ===========================
# 2) RAG utilities: chunking + class LeafletRAG
# ===========================

def chunk_text(text: str, max_words: int = 300) -> List[str]:
    """
    Split *text* into consecutive chunks of at most *max_words* words.

    Words are whitespace-separated tokens; inside a chunk they are re-joined
    with single spaces. Empty or whitespace-only text yields an empty list.
    A *max_words* below 1 behaves like 1 (one word per chunk).
    """
    tokens = text.split()
    # A non-positive limit closes the chunk after every single word.
    step = max(max_words, 1)
    return [
        " ".join(tokens[offset:offset + step])
        for offset in range(0, len(tokens), step)
    ]


def chunk_matches_filters(chunk: Dict, filters: Dict) -> bool:
    """
    Tell whether a chunk passes the filters selected in the GUI.

    Three independent checks are applied, each only when its filter is set:
    - date: the chunk's [date_start, date_end] range must overlap the
      requested window; chunks with no date at all are rejected;
    - print run: the numeric print run must lie within [min, max]; chunks
      with an unknown print run pass only if "include_unk_print_run" is set;
    - organisation: at least one of the given substrings must occur in the
      lower-cased author + source + title + file_name text.
    """
    # Date window
    window_from = filters.get("date_from")
    window_to = filters.get("date_to")
    if window_from or window_to:
        chunk_start = chunk.get("date_start")
        chunk_end = chunk.get("date_end")
        if (
            (chunk_start is None and chunk_end is None)
            or (window_from and chunk_end is not None and chunk_end < window_from)
            or (window_to and chunk_start is not None and chunk_start > window_to)
        ):
            return False

    # Print run
    run_floor = filters.get("print_run_min")
    run_ceiling = filters.get("print_run_max")
    allow_unknown = bool(filters.get("include_unk_print_run", False))

    if not (run_floor is None and run_ceiling is None):
        run = chunk.get("print_run_value")
        if run is None:
            if not allow_unknown:
                return False
        elif (run_floor is not None and run < run_floor) or (
            run_ceiling is not None and run > run_ceiling
        ):
            return False

    # Organisation: substring search over author + source + title + file_name
    needles = filters.get("org_substrings") or []
    if needles:
        searched_fields = ("author", "source", "title", "file_name")
        haystack = " ".join(chunk.get(field, "") for field in searched_fields).lower()
        if all(needle not in haystack for needle in needles):
            return False

    return True


class LeafletRAG:
    """
    Simple RAG system for the Latvian Communist Leaflet Corpus.

    Leaflet texts are split into word-count chunks, embedded with a
    SentenceTransformer model and stored in a FAISS inner-product index.
    Retrieval embeds the query, searches the index and keeps the best-scoring
    chunks that pass the metadata filters.
    """

    def __init__(
        self,
        model_name: str = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
    ):
        """Load the embedding model; the index stays empty until build_index()."""
        self.model = SentenceTransformer(model_name)
        self.index: Optional[faiss.IndexFlatIP] = None
        # One dict per chunk, in the same order as the vectors in the index.
        self.chunks: List[Dict] = []
        self.embedding_dim: Optional[int] = None

    def build_index(
        self,
        leaflets: List[Dict],
        max_words_per_chunk: int = 260,
        normalize_embeddings: bool = True,
    ) -> None:
        """
        Chunk and embed all leaflets and (re)build the FAISS index.

        Args:
            leaflets: Parsed leaflet dicts; leaflets without text are skipped.
            max_words_per_chunk: Maximum number of words per chunk.
            normalize_embeddings: L2-normalise the vectors, so that the
                inner-product score equals cosine similarity.

        Raises:
            ValueError: If no leaflet produced any chunk.
        """
        all_texts: List[str] = []
        self.chunks = []

        for leaflet in leaflets:
            # "or ''" also covers an explicit None stored under "text".
            full_text = leaflet.get("text") or ""
            if not full_text.strip():
                continue

            # Metadata copied onto every chunk of this leaflet.
            shared_meta = {
                "leaflet_id": leaflet.get("id"),
                "file_name": leaflet.get("file_name", ""),
                "title": leaflet.get("title", ""),
                "date": leaflet.get("date", ""),
                "date_start": leaflet.get("date_start"),
                "date_end": leaflet.get("date_end"),
                "print_run": leaflet.get("print_run", ""),
                "print_run_value": leaflet.get("print_run_value"),
                "author": leaflet.get("author", ""),
                "source": leaflet.get("source", ""),
            }

            chunk_list = chunk_text(full_text, max_words=max_words_per_chunk)
            for i, chunk in enumerate(chunk_list):
                self.chunks.append({**shared_meta, "chunk_id": i, "text": chunk})
                all_texts.append(chunk)

        if not all_texts:
            raise ValueError("No text chunks found. Cannot build index.")

        embeddings = self.model.encode(all_texts, convert_to_numpy=True)

        if normalize_embeddings:
            norms = np.linalg.norm(embeddings, axis=1, keepdims=True) + 1e-12
            embeddings = embeddings / norms

        # FAISS expects C-contiguous float32 input.
        embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)

        self.embedding_dim = embeddings.shape[1]
        self.index = faiss.IndexFlatIP(self.embedding_dim)
        self.index.add(embeddings)

    def retrieve(
        self,
        query: str,
        top_k: int = 5,
        normalize_embeddings: bool = True,
        filters: Optional[Dict] = None,
    ) -> List[Dict]:
        """
        Return up to *top_k* best-scoring chunks that pass *filters*.

        Filters are applied after the vector search, so the number of
        candidates requested from the index is enlarged step by step until
        enough chunks pass or the whole index has been searched.

        Args:
            query: Query text.
            top_k: Maximum number of chunks to return; values below 1 give [].
            normalize_embeddings: L2-normalise the query vector.
            filters: Filter dict as understood by chunk_matches_filters.

        Returns:
            Copies of the matching chunk dicts, each with an added "score",
            ordered by descending score.

        Raises:
            RuntimeError: If the index has not been built yet.
        """
        if self.index is None or not self.chunks:
            raise RuntimeError("Index is not built. Call build_index() first.")

        if filters is None:
            filters = {}

        # GUI sliders may deliver floats; FAISS needs an integer k.
        top_k = int(top_k)
        if top_k <= 0:
            return []

        query_emb = self.model.encode([query], convert_to_numpy=True)

        if normalize_embeddings:
            norms = np.linalg.norm(query_emb, axis=1, keepdims=True) + 1e-12
            query_emb = query_emb / norms

        query_emb = np.ascontiguousarray(query_emb, dtype=np.float32)

        total = len(self.chunks)
        search_k = min(max(top_k * 5, top_k + 20), total)

        while True:
            scores, indices = self.index.search(query_emb, search_k)

            results: List[Dict] = []
            for score, idx in zip(scores[0], indices[0]):
                if idx < 0 or idx >= total:
                    continue
                chunk_info = self.chunks[idx].copy()
                chunk_info["score"] = float(score)

                if not chunk_matches_filters(chunk_info, filters):
                    continue

                results.append(chunk_info)
                if len(results) >= top_k:
                    break

            if len(results) >= top_k or search_k >= total:
                return results

            # Too many candidates were filtered out: widen the search.
            search_k = min(search_k * 4, total)

    def answer(
        self,
        query: str,
        top_k: int = 5,
        generator_fn: Optional[Callable[[str, List[Dict]], str]] = None,
        filters: Optional[Dict] = None,
    ) -> Tuple[str, List[Dict]]:
        """
        Retrieve chunks for *query* and produce an answer from them.

        Args:
            query: The user's question.
            top_k: Maximum number of chunks to retrieve.
            generator_fn: Called as generator_fn(query, chunks) to produce the
                answer text. When None, the answer is a plain listing of the
                retrieved chunk texts.
            filters: Filter dict forwarded to retrieve().

        Returns:
            (answer_text, retrieved_chunks).
        """
        retrieved = self.retrieve(query, top_k=top_k, filters=filters)

        if generator_fn is None:
            context_text = "\n\n---\n\n".join(
                f"[{i+1}] {c['text']}" for i, c in enumerate(retrieved)
            )
            answer_text = (
                "Simple concatenation answer.\n\n"
                f"Query: {query}\n\n"
                "Relevant leaflet chunks:\n\n"
                f"{context_text}"
            )
            return answer_text, retrieved

        answer_text = generator_fn(query, retrieved)
        return answer_text, retrieved


def simple_llm_prompt_builder(query: str, chunks: List[Dict]) -> str:
    """
    Build the Latvian instruction prompt sent to the LLM.

    Each chunk is rendered as a numbered block "[i] (title=..., date=...,
    file=..., chunk_id=...)" followed by its text; the blocks are embedded in
    a fixed set of methodological instructions together with the question.

    Args:
        query: The user's question.
        chunks: Retrieved chunk dicts; missing fields are rendered as empty.

    Returns:
        The complete prompt with leading and trailing whitespace removed.
    """
    context_blocks = [
        (
            f"[{i}] ("
            f"title={c.get('title', '')}, "
            f"date={c.get('date', '')}, "
            f"file={c.get('file_name', '')}, "
            f"chunk_id={c.get('chunk_id', '')}"
            f")\n{c.get('text', '')}"
        )
        for i, c in enumerate(chunks, start=1)
    ]

    context_str = "\n\n---\n\n".join(context_blocks)

    # The prompt text is kept flush-left on purpose: it is sent verbatim.
    prompt = f"""
Tu esi vēsturnieks, kurš analizē Latvijas komunistisko pagrīdes organizāciju skrejlapas (1934–1940).
Tev ir pieejami tikai zemāk dotie skrejlapu fragmenti.

Tavi metodoloģiskie principi:

1. Atbildi uz jautājumu, balstoties TIKAI uz dotajiem fragmentiem. Nekādu ārēju zināšanu.
2. NEIZDOMĀT faktus. Ja avotos nav tiešas norādes, tas jāklasificē kā nezināms.
3. Katrai atbildes sadaļai ir stingras prasības:

I. **Droši fakti (tieši avotos)**
– Iekļauj tikai informāciju, kas skaidri minēta tekstā.
– Katram faktam pievieno atsauci uz fragmentu (piem., “[3]”).
– Ja iespējams, pievieno īsu citātu no avota (piem., “avots saka: ‘piesprieda nāves sodu Murinam’”).
– Nekad neapvieno divus atsevišķos avotos minētus faktus vienā, ja avots tos nesaista.

II. **Piesardzīgie secinājumi (netieši, bet atļauti)**
– Atļauts tikai tad, ja secinājums loģiski izriet no fragmentu formulējumiem.
– Vienmēr norādi, ka tas ir NETIEŠS secinājums.
– Nekad nepaplašini faktus ārpus tā, ko teksts atļauj.

III. **Nezināmais**
– Skaidri norādi visu, ko no avotiem noteikt NAV iespējams.
– Šo sadaļu vienmēr iekļauj, pat ja šķiet, ka viss ir skaidrs.
– Ja atbilde nav nosakāma, skaidri uzraksti:
  **"To nav iespējams noteikt, balstoties tikai uz šeit dotajiem avotiem."**

4. Stilam jābūt akadēmiski precīzam, konspektīvam, bez retorikas un vispārinājumiem.
5. Atsaucies tikai uz informāciju, kas patiešām ir fragmentos.
6. Nekad neizdari pieņēmumus par kontekstu, motīviem vai faktiem, kas nav tieši norādīti tekstā.

---

Skrejlapu fragmenti:
{context_str}

Jautājums:
{query}

Tagad sniedz īsu, stingri strukturētu atbildi LATVIEŠU valodā tieši šādā formā:

**I. Droši fakti (tieši avotos)**
– ...

**II. Piesardzīgie secinājumi (no fragmentiem izrietoši)**
– ...

**III. Nezināmais**
– ...
"""
    return prompt.strip()


def print_retrieved_chunks(chunks: List[Dict], max_chars: int = 300) -> None:
    """
    Print a readable listing of retrieved chunks to standard output.

    For every chunk the score, the main metadata fields and the text are
    shown; text longer than *max_chars* is cut and marked with "...".
    """
    shown_fields = ("title", "date", "file_name", "chunk_id", "print_run", "author")

    print("\n=== RETRIEVED CHUNKS ===\n")
    for position, item in enumerate(chunks, start=1):
        print(f"[{position}] score={item.get('score', 0):.4f}")
        for field in shown_fields:
            # Field names are padded to a common width so the colons line up.
            print(f"    {field:<10}: {item.get(field, '')}")

        body = item.get("text", "")
        preview = body if len(body) <= max_chars else body[:max_chars] + "..."
        print("    text:")
        print("    " + preview.replace("\n", "\n    "))
        print()


# ===========================
# 4) Gradio GUI – Latvijas komunistisko skrejlapu asistents
# ===========================

# Module-level state shared by the GUI callbacks. Both stay None until
# build_rag_from_zip_gui() has successfully built an index.
global_rag = None  # the LeafletRAG instance used to answer questions
global_leaflets = None  # the leaflet dicts the index was built from


def build_rag_from_zip_gui(zip_file):
    """
    Gradio callback: load the uploaded leaflet ZIP and build the RAG index.

    On success the module-level global_rag / global_leaflets are replaced; on
    any failure they are left as they were.

    Args:
        zip_file: The uploaded file as delivered by Gradio - either an object
            with a ``name`` attribute holding the path, or the path itself.

    Returns:
        (status_message, ""): a message for the user and an empty string for
        the second output.
    """
    global global_rag, global_leaflets

    if zip_file is None:
        return "Nav augšupielādēts ZIP fails.", ""

    # Accept both a file-like wrapper and a plain path string.
    zip_path = getattr(zip_file, "name", zip_file)

    try:
        leaflets = load_leaflets_from_zip(zip_path)
    except Exception as e:
        return f"Kļūda, lasot ZIP: {e}", ""

    if not leaflets:
        return "Neizdevās nolasīt nevienu skrejlapu no ZIP.", ""

    try:
        rag = LeafletRAG()
        rag.build_index(leaflets)
    except Exception as e:
        # E.g. no leaflet contained any text, or the model failed to load.
        return f"Kļūda, veidojot indeksu: {e}", ""

    global_rag = rag
    global_leaflets = leaflets

    info = (
        f"Indekss uzbūvēts. Ielādētas {len(leaflets)} skrejlapas. "
        f"Kopējais fragmentu skaits: {len(rag.chunks)}."
    )
    return info, ""


def _parse_positive_int(raw) -> Optional[int]:
    """Convert a GUI number/text value to a positive int, or None if unusable."""
    if raw is None or str(raw).strip() == "":
        return None
    try:
        number = int(raw)
    except (TypeError, ValueError, OverflowError):
        # Second attempt covers numeric strings such as "1000.0".
        try:
            number = int(float(raw))
        except (TypeError, ValueError, OverflowError):
            return None
    return number if number > 0 else None


def build_filters_from_inputs(
    date_from_str: str,
    date_to_str: str,
    print_run_min,
    print_run_max,
    org_custom: str,
    include_unk_print_run: bool,
) -> Dict:
    """
    Turn the raw GUI filter inputs into the filter dict used for retrieval.

    Args:
        date_from_str: Lower date bound (YYYY, YYYY-MM or YYYY-MM-DD) or empty.
        date_to_str: Upper date bound in the same formats, or empty.
        print_run_min: Minimum print run; empty, non-numeric or non-positive
            values mean "no limit".
        print_run_max: Maximum print run; same rules as *print_run_min*.
        org_custom: Free-text organisation filter; matched case-insensitively
            as a substring.
        include_unk_print_run: Whether leaflets with an unknown print run
            pass a print-run filter.

    Returns:
        Dict with the keys "date_from", "date_to", "print_run_min",
        "print_run_max", "include_unk_print_run" and "org_substrings".

    Raises:
        ValueError: If a non-empty date string cannot be parsed.
    """
    df = parse_user_date_box(date_from_str, is_start=True) if date_from_str else None
    dt = parse_user_date_box(date_to_str, is_start=False) if date_to_str else None

    custom = (org_custom or "").strip().lower()
    org_substrings: List[str] = [custom] if custom else []

    return {
        "date_from": df,
        "date_to": dt,
        "print_run_min": _parse_positive_int(print_run_min),
        "print_run_max": _parse_positive_int(print_run_max),
        "include_unk_print_run": bool(include_unk_print_run),
        "org_substrings": org_substrings,
    }


def qa_on_corpus_gui(
    api_key: str,
    question: str,
    top_k: int,
    preview_chars: int,
    min_score: float,
    model_choice: str,
    date_from_str: str,
    date_to_str: str,
    print_run_min,
    print_run_max,
    org_custom: str,
    include_unk_print_run: bool,
    show_full_chunks: bool,
    temperature: float,
):
    """
    Gradio callback: answer a question about the leaflet corpus.

    Retrieves chunks with the selected filters, asks the LLM for an answer,
    writes the session to the log and builds a preview of the chunks.

    The minimum-score threshold is applied only to the preview and the log;
    the LLM receives all chunks that were retrieved.

    Returns:
        (answer_text, preview_text). When a precondition is not met or the
        answer cannot be generated, the first element is a message for the
        user and the second is an empty string.
    """
    global global_rag

    if global_rag is None:
        return (
            "Indekss vēl nav uzbūvēts. Lūdzu vispirms augšupielādē ZIP un nospied 'Izveidot indeksu'.",
            "",
        )

    # "or ''" lets cleared GUI fields (None) reach the friendly messages.
    if not (api_key or "").strip():
        return "Nav norādīts OpenRouter API key. Lūdzu ievadi savu API key kreisajā pusē.", ""

    if not (question or "").strip():
        return "Lūdzu ievadi jautājumu.", ""

    # Sliders may deliver floats; both values are used as counts / slice bounds.
    top_k = int(top_k)
    preview_chars = int(preview_chars)

    effective_model_id = (model_choice or DEFAULT_MODEL_ID).strip()

    filters = build_filters_from_inputs(
        date_from_str,
        date_to_str,
        print_run_min,
        print_run_max,
        org_custom,
        include_unk_print_run,
    )

    # 0.0 -> score filter off; > 0.0 -> score filter on
    effective_min_score = float(min_score) if min_score is not None else 0.0
    if effective_min_score < 0.0:
        effective_min_score = 0.0

    filters["min_score"] = effective_min_score

    def local_llm_generator(q: str, chunks: List[Dict]) -> str:
        prompt = simple_llm_prompt_builder(q, chunks)
        return ask(
            prompt,
            api_key,
            model_id=effective_model_id,
            temperature=temperature,
        )

    # 1) RAG + LLM (retrieved is already filtered by date / print run / organisation)
    try:
        answer_text, retrieved = global_rag.answer(
            question,
            top_k=top_k,
            generator_fn=local_llm_generator,
            filters=filters,
        )
    except Exception as e:
        # GUI boundary: show network / API / retrieval failures to the user
        # instead of letting the callback crash.
        return f"Kļūda, ģenerējot atbildi: {e}", ""

    # 2) Apply min_score to the preview and the log only
    if effective_min_score > 0.0:
        retrieved = [
            c for c in retrieved
            if c.get("score", 0.0) >= effective_min_score
        ]

    # 3) Logging - the log receives the chunks left after the score filter
    if answer_text is None:
        answer_for_log = "[NONE_ANSWER_FROM_MODEL]"
    else:
        stripped = answer_text.strip()
        answer_for_log = stripped if stripped else "[EMPTY_OR_WHITESPACE_ANSWER]"

    log_qa_event(
        question=question,
        answer=answer_for_log,
        retrieved_chunks=retrieved,
        filters=filters,
        model_id=effective_model_id,
        top_k=top_k,
        temperature=temperature,
        preview_chars_for_log=preview_chars,
    )

    # 4) No chunks left after the score filter
    if not retrieved:
        if effective_min_score > 0.0:
            info_msg = (
                f"Nav atrasts neviens fragments ar līdzības score "
                f"≥ {effective_min_score:.2f} (no top_k={top_k})."
            )
        else:
            info_msg = "Nav atrasts neviens atbilstošs fragments."
        return answer_text, info_msg

    # 5) Preview for the GUI
    preview_lines = []

    if effective_min_score > 0.0:
        used_count = len(retrieved)
        preview_lines.append(
            f"[INFO] Pēc min score {effective_min_score:.2f} filtrēšanas izmantoti "
            f"{used_count} fragmenti (no top_k={top_k})."
        )

    for i, c in enumerate(retrieved, start=1):
        text = c.get("text", "")
        if not show_full_chunks and len(text) > preview_chars:
            text = text[:preview_chars] + "..."

        meta = (
            f"[{i}] score={c.get('score', 0):.4f} | "
            f"title={c.get('title','')}"
            f" date={c.get('date','')}"
            f" print_run={c.get('print_run','')}"
            f" author={c.get('author','')}"
            f" file={c.get('file_name','')}"
            f" chunk_id={c.get('chunk_id','')}"
        )
        preview_lines.append(meta + "\n" + text)

    preview_block = "\n\n---\n\n".join(preview_lines)
    return answer_text, preview_block


def retrieve_only_gui(
    question: str,
    top_k: int,
    preview_chars: int,
    min_score: float,
    date_from_str: str,
    date_to_str: str,
    print_run_min,
    print_run_max,
    org_custom: str,
    include_unk_print_run: bool,
    show_full_chunks: bool,
) -> str:
    """
    Retrieval only: show the matching fragments without calling any LLM.

    No OpenRouter request is made and no API key is needed.

    Args:
        question: Free-text query; ``None`` or blank yields a prompt message.
        top_k: Number of fragments requested from the index (coerced to int).
        preview_chars: Maximum characters of each fragment shown when
            ``show_full_chunks`` is false (coerced to int).
        min_score: Minimum similarity score; ``None`` or a negative value
            is treated as 0.0, which disables the score filter.
        date_from_str, date_to_str, print_run_min, print_run_max, org_custom,
        include_unk_print_run: Passed unchanged to
            ``build_filters_from_inputs`` to build the retrieval filters.
        show_full_chunks: When true, fragment texts are not truncated.

    Returns:
        A single string for the GUI textbox: either a status/error message
        or the fragment previews separated by ``---`` lines.
    """
    if global_rag is None:
        return "Indekss vēl nav uzbūvēts. Lūdzu vispirms uzbūvē indeksu."

    # The textbox value may arrive as None; treat that like an empty question
    # instead of failing on None.strip().
    if not (question or "").strip():
        return "Lūdzu ievadi jautājumu."

    # Numeric widgets / API clients may deliver floats (e.g. 12.0); the value
    # is used as a result count and as a slice bound, both of which need ints.
    top_k = int(top_k)
    preview_chars = int(preview_chars)

    filters = build_filters_from_inputs(
        date_from_str,
        date_to_str,
        print_run_min,
        print_run_max,
        org_custom,
        include_unk_print_run,
    )

    # Missing or negative thresholds mean "no score filtering".
    effective_min_score = float(min_score) if min_score is not None else 0.0
    if effective_min_score < 0.0:
        effective_min_score = 0.0
    filters["min_score"] = effective_min_score

    retrieved = global_rag.retrieve(
        query=question,
        top_k=top_k,
        filters=filters,
    )

    # The threshold is also applied here, so the result does not depend on
    # whether retrieve() honours filters["min_score"] itself.
    if effective_min_score > 0.0:
        retrieved = [
            c for c in retrieved
            if c.get("score", 0.0) >= effective_min_score
        ]

    if not retrieved:
        if effective_min_score > 0.0:
            return (
                f"Nav atrasts neviens fragments ar līdzības score "
                f"≥ {effective_min_score:.2f} (no top_k={top_k})."
            )
        return "Nav atrasts neviens atbilstošs fragments."

    lines = []

    # Header line reporting how many fragments survived the score filter.
    if effective_min_score > 0.0:
        used_count = len(retrieved)
        lines.append(
            f"[INFO] Pēc min score {effective_min_score:.2f} filtrēšanas izmantoti "
            f"{used_count} fragmenti (no top_k={top_k})."
        )

    for i, c in enumerate(retrieved, start=1):
        text = c.get("text", "")
        if not show_full_chunks and len(text) > preview_chars:
            text = text[:preview_chars] + "..."

        # One metadata line per fragment, followed by its (possibly cut) text.
        meta = (
            f"[{i}] score={c.get('score', 0):.4f} | "
            f"title={c.get('title','')}"
            f" date={c.get('date','')}"
            f" print_run={c.get('print_run','')}"
            f" author={c.get('author','')}"
            f" file={c.get('file_name','')}"
            f" chunk_id={c.get('chunk_id','')}"
        )

        lines.append(meta + "\n" + text)

    return "\n\n---\n\n".join(lines)


# Gradio UI definition: the left column holds configuration (API key, model,
# index building, retrieval settings and filters), the right column holds the
# question box, the answer, the fragment preview and the log download.
with gr.Blocks() as gui:
    gr.Markdown(
        "## Latvijas komunistisko organizāciju skrejlapu RAG asistents (1934–1940)\n"
        "Augšupielādē Latvijas komunistisko organizāciju skrejlapu korpusa ZIP failu un uzdod vēsturiskus jautājumus.\n"
        "Atbildes balstītas TIKAI uz skrejlapu tekstiem."
    )

    with gr.Row():
        with gr.Column():
            # --- LLM settings ---
            api_key_box = gr.Textbox(
                label="OpenRouter API key",
                type="password",
                placeholder="ievadi savu OpenRouter API key šeit",
            )

            # allow_custom_value lets the user type a model id that is not
            # in the predefined list.
            model_choice_box = gr.Dropdown(
                label="OpenRouter modelis (vari izvēlēties vai ierakstīt pats)",
                choices=[
                    DEFAULT_MODEL_ID,
                    "anthropic/claude-3.5-haiku",

                    "openai/gpt-4.1",
                    "openai/gpt-4.1-mini",
                    "openai/gpt-4o",
                    "openai/gpt-4o-mini",

                    "qwen/qwen-2.5-7b-instruct",

                    "deepseek/deepseek-chat",

                    "mistralai/mistral-large-2512",
                    "mistralai/mistral-small-3.2-24b-instruct",
                    "mistralai/mistral-nemo",

                    "meta-llama/llama-3.1-70b-instruct",
                    "meta-llama/llama-3.1-8b-instruct",

                    "google/gemini-2.5-flash",
                    "google/gemini-2.5-flash-lite",
                    "google/gemini-2.5-pro",

                    "meta-llama/llama-3.3-70b-instruct:free",
                    "amazon/nova-2-lite-v1:free",
                    "mistralai/mistral-7b-instruct:free",
                    "kwaipilot/kat-coder-pro:free",
                    "tngtech/deepseek-r1t2-chimera:free",
                ],
                value=DEFAULT_MODEL_ID,
                allow_custom_value=True,
            )

            temperature_inp = gr.Slider(
                label="Temperature (0.0 = mazāka variācija, 1.0 = lielāka variācija)",
                minimum=0.0,
                maximum=1.0,
                value=0.2,
                step=0.05,
            )

            # --- Corpus upload and index building ---
            zip_input = gr.File(label="ZIP ar LKP skrejlapu .txt failiem")
            build_btn = gr.Button("Izveidot indeksu")
            build_status = gr.Textbox(label="Status", interactive=False)

            # --- Retrieval settings ---
            top_k_inp = gr.Slider(
                label="Cik fragmentus izmantot (top_k)?",
                minimum=1,
                maximum=30,
                value=12,
                step=1,
            )
            preview_chars_inp = gr.Slider(
                label="Cik simbolus rādīt katrā fragmenta preview?",
                minimum=50,
                maximum=1000,
                value=300,
                step=50,
            )

            min_score_inp = gr.Slider(
                label="Minimālais līdzības score (0 = izslēgts)",
                minimum=0.0,
                maximum=1.0,
                value=0.0,
                step=0.01,
            )

            show_full_chunks_box = gr.Checkbox(
                label="Rādīt pilnus fragmentus (nevis tikai preview)",
                value=False,
            )

            # --- Retrieval filters (date range, print run, organisation) ---
            date_from_box = gr.Textbox(
                label="Datums no (YYYY, YYYY-MM vai YYYY-MM-DD, tukšs – nav filtra)",
                placeholder="piem., 1934-01",
            )
            date_to_box = gr.Textbox(
                label="Datums līdz (YYYY, YYYY-MM vai YYYY-MM-DD, tukšs – nav filtra)",
                placeholder="piem., 1936-12",
            )

            print_run_min_box = gr.Number(
                label="Tirāža no (>=, tukšs – nav filtra)",
                value=None,
                precision=0,
            )
            print_run_max_box = gr.Number(
                label="Tirāža līdz (<=, tukšs – nav filtra)",
                value=None,
                precision=0,
            )

            include_unk_print_run_box = gr.Checkbox(
                label="Iekļaut skrejlapas ar nezināmu tirāžu (unk), ja ir tirāžas filtrs",
                value=True,
            )

            org_custom_box = gr.Textbox(
                label="Papildu organizācijas filtrs (brīvs teksts, pēc apakšvirknes)",
                placeholder="piem., LKP CK, Rīgas komiteja, Sarkanā palīdzība, VEF, Daugavpils",
            )

        with gr.Column():
            # --- Question, answer and fragment preview ---
            question_box = gr.Textbox(
                label="Jautājums par Latvijas komunistisko organizāciju skrejlapu korpusu",
                lines=3,
                placeholder=(
                    "Piemēram: Nosauc, kuriem komunistiem piesprieda nāvessodu "
                    "Ulmaņa režīma laikā!"
                ),
            )
            ask_btn = gr.Button("Uzdot jautājumu")
            retrieve_btn = gr.Button("Rādīt tikai fragmentus (bez LLM)")

            answer_out = gr.Markdown(label="Atbilde")
            chunks_out = gr.Textbox(
                label="Izmantotie fragmenti (preview vai pilni)",
                lines=20,
            )

            # --- Q&A log download ---
            with gr.Row():
                # Label fixed: the conjunction was a stray Cyrillic character
                # in an otherwise Latvian string.
                log_btn = gr.Button("Izveidot un lejupielādēt žurnālu (JSON)")
                log_file_out = gr.File(
                    label="Žurnāla fails (lkp_rag_log.json)",
                    interactive=False,
                )

    # --- Event wiring ---
    # The order of `inputs` must match the positional parameters of each
    # handler function.

    # Build the index from the uploaded ZIP; writes to the status box and the
    # fragment textbox.
    build_btn.click(
        fn=build_rag_from_zip_gui,
        inputs=[zip_input],
        outputs=[build_status, chunks_out],
    )

    # Full question answering: retrieval + LLM call.
    ask_btn.click(
        fn=qa_on_corpus_gui,
        inputs=[
            api_key_box,
            question_box,
            top_k_inp,
            preview_chars_inp,
            min_score_inp,
            model_choice_box,
            date_from_box,
            date_to_box,
            print_run_min_box,
            print_run_max_box,
            org_custom_box,
            include_unk_print_run_box,
            show_full_chunks_box,
            temperature_inp,
        ],
        outputs=[answer_out, chunks_out],
    )

    # Retrieval only: no API key, model or temperature inputs.
    retrieve_btn.click(
        fn=retrieve_only_gui,
        inputs=[
            question_box,
            top_k_inp,
            preview_chars_inp,
            min_score_inp,
            date_from_box,
            date_to_box,
            print_run_min_box,
            print_run_max_box,
            org_custom_box,
            include_unk_print_run_box,
            show_full_chunks_box,
        ],
        outputs=[chunks_out],
    )

    # Offer the log file for download.
    log_btn.click(
        fn=get_log_file_for_gui,
        inputs=[],
        outputs=[log_file_out],
    )


# Start the app when this file/cell is executed as the main module;
# inbrowser=True asks Gradio to open the UI in a browser tab.
if __name__ == "__main__":
    gui.launch(inbrowser=True)


It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://db4e31ac353a7db1e1.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
