In [None]:
# Install dependencies (IPython shell escapes; run this cell inside a notebook).
# faiss-cpu provides the `faiss` module imported below for the vector index;
# bitsandbytes is upgraded for the 4-bit BitsAndBytesConfig used when loading
# the generative models.
!pip install faiss-cpu --quiet
!pip install -U bitsandbytes --quiet

In [None]:
import os
import re
import json
import torch
import numpy as np
import faiss
from pathlib import Path
# The opening parenthesis must sit on the same line as `import`; otherwise the
# statement is a SyntaxError.
from transformers import (
    AutoTokenizer,
    AutoModel,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    pipeline,
)
from tqdm.notebook import tqdm
from huggingface_hub import login
import pandas as pd
from typing import List, Dict, Tuple

# Paths and settings
ISDE_TXT_PATH = Path("/content/ISDE 4.5.txt")  # plain-text export of the regulation
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
TOP_K = 10  # default number of chunks returned by retrieval

# Silence the tokenizers fork/parallelism warning.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Login to HuggingFace. Prefer the HF_TOKEN environment variable so the secret
# never has to be written into the notebook; the placeholder is kept as the
# fallback for backward compatibility (replace with token).
login(token=os.environ.get("HF_TOKEN", "YOURTOKENHERE"))

print(f"Using device: {DEVICE}")

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m72.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.0/67.0 MB[0m [31m34.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m124.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m94.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m56.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# Chunking and Cleaning

def chunk_text_articles(text: str) -> List[Dict[str, str]]:
    """
    Split regulation text into article-aware chunks.

    A line starting with ``Artikel <n.n.n>`` opens a new article and closes
    the running chunk (kept only when longer than 40 characters). Inside an
    article, a sub-point line (``1°.``, ``a.``, ...) closes the running chunk
    once more than 100 characters have been buffered; shorter buffers simply
    absorb the sub-point. Text buffered before the first article header is
    never emitted.

    Returns a list of dicts with the keys ``text``, ``article`` (the article
    number) and ``full_text`` (the text prefixed with ``Artikel <number>: ``)
    for better traceability.
    """
    header_pattern = re.compile(r'^Artikel\s+(\d+\.\d+\.\d+)')
    subpoint_pattern = re.compile(r'^\s*(\d+°?\.|\w\.)\s+')

    def build_chunk(article_id: str, body: str) -> Dict[str, str]:
        # Single place where the chunk record is assembled.
        return {
            "text": body,
            "article": article_id,
            "full_text": f"Artikel {article_id}: {body}",
        }

    collected = []
    article_id = None
    pending = []

    for raw_line in text.splitlines():
        header = header_pattern.match(raw_line)
        if header is not None:
            # A new article starts: flush what belongs to the previous one.
            if pending and article_id:
                body = " ".join(pending).strip()
                if len(body) > 40:
                    collected.append(build_chunk(article_id, body))
            article_id = header.group(1)
            pending = [raw_line]
            continue

        stripped = raw_line.strip()
        if subpoint_pattern.match(raw_line) is not None:
            buffered = " ".join(pending)
            if pending and len(buffered) > 100:
                # Enough material buffered: emit it and restart at the sub-point.
                if article_id:
                    collected.append(build_chunk(article_id, buffered.strip()))
                pending = [raw_line]
            else:
                pending.append(stripped)
        elif stripped:
            # Ordinary non-blank line: keep accumulating.
            pending.append(stripped)

    # Flush whatever is left for the last article.
    if pending and article_id:
        body = " ".join(pending).strip()
        if len(body) > 40:
            collected.append(build_chunk(article_id, body))

    return collected

def clean_isde_chunks_enhanced(chunks: List[Dict[str, str]]) -> List[Dict[str, str]]:
    """
    Filter chunks down to those carrying substantive legal content.

    Matching is done on the stripped, lower-cased ``text`` of each chunk.
    A chunk is dropped when it contains a metadata/reference marker, or when
    it looks like a bare definition that mentions neither "subsidie" nor
    "investering". Of the remaining chunks only those containing at least one
    content keyword are returned (same dict objects, original order).
    """
    skip_markers = ("geldend van", "datum", "t/m", "zie:", "zie artikel")
    definition_pattern = re.compile(
        r"^(een.*natuurlijke persoon.*|.*wordt verstaan onder.*)$"
    )
    content_keywords = (
        "subsidie", "investering", "warmtepomp", "isolatie",
        "zonneboiler", "windturbine", "warmtenet", "eigenaar-bewoner",
        "voorwaarden", "bedraagt", "€", "vierkante meter", "vermogen",
        "toegewezen",
    )

    def is_relevant(chunk: Dict[str, str]) -> bool:
        lowered = chunk["text"].strip().lower()

        # Metadata and cross-references are never kept.
        if any(marker in lowered for marker in skip_markers):
            return False

        # Pure definitions are only kept when they mention a key term.
        bare_definition = (
            definition_pattern.match(lowered) is not None
            and "subsidie" not in lowered
            and "investering" not in lowered
        )
        if bare_definition:
            return False

        return any(keyword in lowered for keyword in content_keywords)

    return [chunk for chunk in chunks if is_relevant(chunk)]

In [None]:
# Embedding and Advanced Retrieval

def embed_bert_with_pooling(texts: List[str], model, tokenizer, batch_size: int = 16) -> np.ndarray:
    """
    Embed texts by mean-pooling the encoder's last hidden state.

    Texts are tokenized in batches (padded, truncated to 512 tokens) and moved
    to ``DEVICE``. Padding positions are excluded from the mean through the
    attention mask. The stacked result is L2-normalised per row.

    Args:
        texts: Strings to embed.
        model: Encoder whose output exposes ``last_hidden_state``.
        tokenizer: Tokenizer matching ``model``.
        batch_size: Number of texts per forward pass.

    Returns:
        One row per input text, each scaled to (approximately) unit length.
    """
    pooled_batches = []
    batch_starts = range(0, len(texts), batch_size)

    for start in tqdm(batch_starts, desc="Embedding with BERT"):
        encoded = tokenizer(
            texts[start:start + batch_size],
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors="pt",
        ).to(DEVICE)

        with torch.no_grad():
            hidden = model(**encoded).last_hidden_state
            # Broadcast the mask over the hidden dimension so padded tokens
            # contribute nothing to the sum.
            mask = encoded['attention_mask'].unsqueeze(-1).expand(hidden.size()).float()
            summed = torch.sum(hidden * mask, 1)
            # Clamp guards against division by zero for an all-padding row.
            token_counts = torch.clamp(mask.sum(1), min=1e-9)
            pooled_batches.append((summed / token_counts).cpu().numpy())

    stacked = np.vstack(pooled_batches)
    # Normalize embeddings (epsilon avoids dividing by a zero norm).
    row_norms = np.linalg.norm(stacked, axis=1, keepdims=True)
    return stacked / (row_norms + 1e-10)


def extract_keywords_enhanced(query: str, min_len: int = 3) -> List[str]:
    """
    Extract de-duplicated keywords and numeric quantity phrases from a query.

    The query is lower-cased and tokenized. Tokens shorter than ``min_len``,
    Dutch stopwords and purely numeric tokens are discarded. Quantity phrases
    such as ``"30 m²"`` or ``"12 kw"`` are appended after the word tokens.

    Args:
        query: Free-text user query.
        min_len: Minimum token length for a word to count as a keyword.

    Returns:
        Unique keywords in order of first appearance (words first, then
        quantity phrases). The order is deterministic, unlike a ``set``
        round-trip whose order depends on string hash randomisation.
    """
    lowered = query.lower()
    tokens = re.findall(r'\b[\w°²µ%€]+\b', lowered)
    stopwords = {"de", "het", "een", "en", "van", "voor", "in", "op", "aan", "binnen",
                 "is", "zijn", "wordt", "werden", "was", "waren", "heeft", "hebben"}

    keywords = [
        token for token in tokens
        if len(token) >= min_len and token not in stopwords and not token.isdigit()
    ]
    # Number + unit phrases are matched on the raw query because the tokenizer
    # above splits them apart.
    keywords.extend(re.findall(r'\d+\s*(?:m²|kw|maand|mnd|jaar)', lowered))

    # dict.fromkeys removes duplicates while keeping first-seen order.
    return list(dict.fromkeys(keywords))

def retrieve_chunks_with_reranking(
    query: str,
    chunks: List[Dict[str, str]],
    embeddings: np.ndarray,
    index: faiss.Index,
    embed_func,
    model, tokenizer,
    k: int = TOP_K,
    rerank_pool: int = 30
) -> Tuple[List[int], List[float], List[Dict[str, str]]]:
    """
    Retrieve chunks by vector search, then rerank with lexical signals.

    A candidate pool is fetched from ``index`` and each candidate is scored as
    ``0.6 * semantic + 0.3 * keyword + 0.1 * legal_density`` where
    * semantic is the dot product of the chunk embedding and query embedding,
    * keyword is the fraction of query keywords found in the chunk text,
    * legal_density is the fraction of a fixed list of legal terms found.

    Args:
        query: User query.
        chunks: Chunk dicts; ``chunks[i]`` must correspond to ``embeddings[i]``
            and to id ``i`` in ``index``.
        embeddings: Chunk embedding matrix, one row per chunk.
        index: FAISS index built over ``embeddings``.
        embed_func: Callable ``(texts, model, tokenizer) -> array`` used to
            embed the query.
        model, tokenizer: Passed through to ``embed_func``.
        k: Number of results to return.
        rerank_pool: Number of nearest neighbours to rerank. It is raised to
            ``k`` when smaller, so a large ``k`` is not silently capped.

    Returns:
        ``(indices, scores, chunks)`` for the best ``k`` candidates, sorted by
        descending combined score. Fewer than ``k`` entries are returned when
        the index yields fewer valid neighbours.
    """
    # Initial semantic retrieval
    q_emb = embed_func([query], model, tokenizer)[0]
    pool_size = max(rerank_pool, k)
    _, neighbor_ids = index.search(q_emb.reshape(1, -1), pool_size)

    # Query-level inputs are computed once, outside the candidate loop.
    keywords = extract_keywords_enhanced(query)
    keyword_total = max(len(keywords), 1)  # avoids division by zero
    legal_terms = ("subsidie", "voorwaarden", "investering", "eigenaar-bewoner",
                   "bedraagt", "indien", "betreft", "aanvraag")

    # Rerank based on multiple factors
    candidates = []
    for idx in neighbor_ids[0]:
        if idx == -1:
            # FAISS pads the result with -1 when it finds fewer neighbours.
            continue
        chunk = chunks[idx]
        text_lower = chunk["text"].lower()

        semantic_score = float(embeddings[idx] @ q_emb)
        keyword_score = sum(1 for kw in keywords if kw in text_lower) / keyword_total
        legal_density = sum(1 for term in legal_terms if term in text_lower) / len(legal_terms)

        combined_score = (
            0.6 * semantic_score +
            0.3 * keyword_score +
            0.1 * legal_density
        )
        candidates.append((int(idx), combined_score, chunk))

    # Sort by combined score and keep the top k
    candidates.sort(key=lambda candidate: candidate[1], reverse=True)
    top = candidates[:k]

    chosen_idxs = [candidate[0] for candidate in top]
    chosen_scores = [candidate[1] for candidate in top]
    chosen_chunks = [candidate[2] for candidate in top]

    return chosen_idxs, chosen_scores, chosen_chunks

In [None]:
# Prompting and Extraction Functions

def prepare_prompt_content(query: str, chunk: Dict[str, str]) -> str:
    """
    Build the Dutch extraction prompt for one scenario/chunk pair.

    Args:
        query: Scenario description, interpolated after ``SCENARIO:``.
        chunk: Retrieved chunk; only ``chunk['text']`` is read and placed
            under ``WETTEKST:``.

    Returns:
        The prompt string: task statement, frame schema (actor, action,
        object, conditions, results), the scenario, the legal text, and a
        final instruction to answer with a JSON object holding a "frames"
        list (an empty list when the legal text is not relevant).
    """

    # Doubled braces ({{ }}) are f-string escapes that render as literal JSON
    # braces. The indentation of the lines below is inside the string literal,
    # so it is sent to the model as part of the prompt.
    content = f"""Jouw taak is om de WETTEKST te analyseren en de juridische regels die van toepassing zijn op het SCENARIO te extraheren. Je MOET de output formatteren als een JSON-object volgens het onderstaande SCHEMA.

    --- SCHEMA DEFINITIE ---
    Je moet een lijst van "frames" teruggeven. Elk frame in de lijst moet de volgende exacte structuur hebben:
    {{
      "actor": "STRING // Wie of wat voert de actie uit?",
      "action": "STRING // Welke handeling wordt er uitgevoerd?",
      "object": "STRING // Waarop heeft de handeling betrekking?",
      "conditions": "STRING // ALLE voorwaarden waaraan voldaan moet worden, letterlijk uit de tekst.",
      "results": "STRING // ALLE juridische gevolgen als aan de voorwaarden is voldaan, letterlijk uit de tekst."
    }}
    --- EINDE SCHEMA ---

    Hier is de taak:

    SCENARIO: "{query}"

    WETTEKST:
    {chunk['text']}

    --- FINALE INSTRUCTIE ---
    Genereer een JSON-object met een "frames"-lijst. Elk frame MOET de structuur van het hierboven gedefinieerde SCHEMA volgen.
    Extraheer de informatie UITSLUITEND uit de WETTEKST. Uw antwoord moet een neutrale, derdepersoonsanalyse van de wet zijn.
    Als de WETTEKST geen relevante informatie bevat voor het scenario, geef dan een lege lijst terug: `{{"frames": []}}`.
    """
    return content

def extract_json_enhanced(text: str) -> dict:
    """
    Extract a ``{"frames": [...]}`` object from raw model output.

    Three strategies are tried in order:

    1. Greedy regex from the first ``{"frames": [`` to the last ``]}``,
       parsed with ``json.loads`` (works for clean output).
    2. ``JSONDecoder.raw_decode`` at every ``{"frames": [`` occurrence. This
       parses the object and ignores whatever follows it, so trailing
       commentary containing ``]}`` or a repeated answer no longer breaks
       parsing. The first object with a non-empty ``frames`` list wins.
    3. Salvage: every flat frame object (keys actor, action, object,
       conditions, results in that order, no nested braces) that parses on its
       own is collected. This recovers all complete frames from output that
       was cut off mid-JSON instead of only the first one.

    Args:
        text: Raw text generated by the model.

    Returns:
        A dict with a ``"frames"`` list; ``{"frames": []}`` when nothing could
        be recovered. Never raises on malformed JSON.
    """
    # 1) Find the full json object.
    json_match = re.search(r'\{\s*\"frames\"\s*:\s*\[.*\]\s*\}', text, re.DOTALL)
    if json_match:
        try:
            return json.loads(json_match.group(0))
        except json.JSONDecodeError:
            pass  # deliberate: fall through to the more tolerant strategies

    # 2) Decode from each candidate start and ignore trailing text.
    decoder = json.JSONDecoder()
    for start_match in re.finditer(r'\{\s*\"frames\"\s*:\s*\[', text):
        try:
            candidate, _ = decoder.raw_decode(text, start_match.start())
        except json.JSONDecodeError:
            continue
        if isinstance(candidate, dict):
            frames = candidate.get("frames")
            # Empty lists are skipped so a later, populated answer (or the
            # salvage step) can still contribute frames.
            if isinstance(frames, list) and frames:
                return candidate

    # 3) Find content inside the frame braces, keeping every parseable frame.
    frame_pattern = (
        r'\{[^{}]*\"actor\"[^{}]*\"action\"[^{}]*\"object\"'
        r'[^{}]*\"conditions\"[^{}]*\"results\"[^{}]*\}'
    )
    salvaged = []
    for frame_match in re.finditer(frame_pattern, text, re.DOTALL):
        try:
            salvaged.append(json.loads(frame_match.group(0)))
        except json.JSONDecodeError:
            continue

    return {"frames": salvaged}

In [None]:
# Data Collection Function

def extract_generation_attempts_for_test_case(
    query: str,
    generator,
    tokenizer,
    chunks: List[Dict[str, str]],
    embeddings: np.ndarray,
    index: faiss.Index,
    embed_func,
    embed_model,
    embed_tokenizer,
    top_k: int = 10
) -> Tuple[List[Dict], List[Dict]]:
    """
    Run retrieval and per-chunk generation for a single test-case query.

    Steps: retrieve and rerank ``top_k`` chunks, build one chat-formatted
    prompt per chunk, generate all prompts in one batched ``generator`` call,
    then parse each output into frames.

    Returns:
        ``(stored_chunks, generation_attempts)``. ``stored_chunks`` holds the
        text, article and retrieval score of each retrieved chunk.
        ``generation_attempts`` holds, per chunk, the source article, the
        retrieval score, the raw model output and the parsed frames. Both are
        empty lists when retrieval yields nothing.
    """
    # Retrieve chunks (the returned index list is not needed here)
    _, scores, retrieved_chunks = retrieve_chunks_with_reranking(
        query, chunks, embeddings, index, embed_func, embed_model, embed_tokenizer, k=top_k
    )
    print(f"  > Retrieved {len(retrieved_chunks)} chunks for query: '{query[:50]}...'")
    if not retrieved_chunks:
        return [], []

    stored_chunks = [
        {"text": hit["text"], "article": hit["article"], "retrieval_score": float(score)}
        for hit, score in zip(retrieved_chunks, scores)
    ]

    # One chat-formatted prompt per retrieved chunk
    prompts_to_generate = [
        tokenizer.apply_chat_template(
            [{"role": "user", "content": prepare_prompt_content(query, hit)}],
            tokenize=False,
            add_generation_prompt=True
        )
        for hit in retrieved_chunks
    ]

    # Call the generator with the entire list of prompts
    outputs = generator(
        prompts_to_generate,
        max_new_tokens=384,
        temperature=0.2,
        do_sample=True,
        top_p=0.95,
        pad_token_id=tokenizer.eos_token_id,
        batch_size=4
    )

    # Pair every output with the chunk, score and prompt that produced it
    generation_attempts = []
    for output, hit, score, prompt in zip(outputs, retrieved_chunks, scores, prompts_to_generate):
        usable = bool(output) and isinstance(output, list) and 'generated_text' in output[0]
        if usable:
            # Remove the original prompt from the generated text
            raw_text = output[0]['generated_text'].replace(prompt, "").strip()
        else:
            raw_text = ""

        parsed_json = extract_json_enhanced(raw_text)

        generation_attempts.append({
            "source_article": hit["article"],
            "retrieval_score": float(score),
            "raw_model_output": raw_text,
            "parsed_frames": parsed_json.get("frames", [])
        })

    return stored_chunks, generation_attempts

# Deduplicate the returned frames
def deduplicate_frames(frames: List[dict], scores: List[float]) -> List[dict]:
    """
    Keep one frame per (action, object) pair.

    Frames are grouped on the lower-cased, stripped ``action`` and ``object``
    values. Within a group the frame with the highest score wins; ties are
    broken by the longer ``conditions``, then the longer ``results``, then the
    earliest position. Groups keep the order of their first occurrence.

    The frames come from LLM-generated JSON, so values may be ``null``,
    numbers, lists or nested objects, and an entry may not be a dict at all.
    Such values are normalised instead of raising, and non-dict entries are
    skipped, so one malformed frame cannot abort a whole run.

    Args:
        frames: Parsed frame dicts.
        scores: ``scores[i]`` is the retrieval score belonging to ``frames[i]``.

    Returns:
        The winning frame objects, one per distinct key.
    """
    if not frames:
        return []

    def _key_part(value) -> str:
        # None and non-string values are coerced so .lower() cannot fail.
        return "" if value is None else str(value).lower().strip()

    def _size(value) -> int:
        # Length used for tie-breaking; sized values keep their own len().
        if value is None:
            return 0
        return len(value) if hasattr(value, "__len__") else len(str(value))

    best_by_key = {}
    for i, frame in enumerate(frames):
        if not isinstance(frame, dict):
            continue
        key = (_key_part(frame.get("action", "")), _key_part(frame.get("object", "")))
        rank = (scores[i], _size(frame.get("conditions", "")), _size(frame.get("results", "")))
        current = best_by_key.get(key)
        # Strict comparison keeps the earliest frame on a full tie.
        if current is None or rank > current[0]:
            best_by_key[key] = (rank, frame)

    return [frame for _, frame in best_by_key.values()]

In [None]:
# Main Execution and Testing Harness
import time
import gc

# Define Models and Test Cases
# Encoder checkpoints compared for retrieval; each name is passed to
# AutoTokenizer/AutoModel.from_pretrained in the outer loop below.
EMBEDDING_MODELS_TO_TEST = [
    "Gerwin/legal-bert-dutch-english",
    "pdelobelle/robbert-v2-dutch-base"
]

# Generative checkpoints compared for frame extraction; each is loaded with
# 4-bit quantization in the inner loop below.
MODELS_TO_TEST = [
    "mistralai/Mixtral-8x7B-Instruct-v0.1",
    "mistralai/Mistral-7B-Instruct-v0.1",
    "BramVanroy/GEITje-7B-ultra",
    "ReBatch/Llama-3-8B-dutch",
    "BramVanroy/fietje-2b-chat",
    "ArliAI/Llama-3.1-8B-ArliAI-Formax-v1.0"
]

# Evaluation scenarios (Dutch). "description" is used as the retrieval query
# and as the SCENARIO in the prompt; "id" labels the result record.
TEST_CASES = [
    {"id": "TC01", "description": "Ik heb in mijn koopwoning de vloerisolatie met biobased isolatiemateriaal over een oppervlakte van 30 m² laten installeren. De werkzaamheden zijn vorige maand afgerond. Wat is het te verwachten subsidiebedrag?"},
    {"id": "TC02", "description": "Ik wil een lucht-waterwarmtepomp met een thermisch vermogen van 12 kW en een A+++ energielabel installeren in mijn woning. De installatie vindt plaats in augustus 2024. Hoe wordt de subsidie berekend?"},
    {"id": "TC03", "description": "Kom ik in aanmerking voor subsidie als ik een zonneboiler laat installeren met een apertuuroppervlakte van 12 vierkante meter? Het is een zonneboiler met energie-efficiëntieklasse A."},
    {"id": "TC04", "description": "Mijn huis is recent aangesloten op het warmtenet en mijn gasmeter is vorige week verwijderd. Ik wil nu ook subsidie aanvragen voor de eenmalige aanschaf van een nieuwe elektrische kookvoorziening. Is dit mogelijk en wat zijn de voorwaarden?"},
]

# Main Testing Loop
# For every (embedding model, generative model) pair: embed the chunks, build
# a FAISS index, run all test cases and write one results.json per pair.
output_dir = Path("./thesis_results_final/")
output_dir.mkdir(parents=True, exist_ok=True)

# Load and process text data
print("Loading and chunking ISDE text...")
raw_text = ISDE_TXT_PATH.read_text(encoding="utf-8")
chunks = chunk_text_articles(raw_text)
chunks = clean_isde_chunks_enhanced(chunks)
print(f"Created {len(chunks)} clean, article-aware chunks.")
chunk_texts_for_embedding = [c['full_text'] for c in chunks]

# (embedding model, generative model) pairs that raised, so the closing
# message reflects what actually happened.
failed_combinations = []

# Outer loop for embedding models
for embed_model_name in EMBEDDING_MODELS_TO_TEST:
    print("=" * 80)
    print(f"Now testing with EMBEDDING MODEL: {embed_model_name}")
    print("=" * 80)

    # Setup embedding model and FAISS index
    print(f"  > Loading embedding model: {embed_model_name}...")
    embed_tokenizer = AutoTokenizer.from_pretrained(embed_model_name)
    embed_model = AutoModel.from_pretrained(embed_model_name).to(DEVICE).eval()

    print("  > Creating embeddings for all chunks...")
    embeddings = embed_bert_with_pooling(chunk_texts_for_embedding, embed_model, embed_tokenizer)

    print("  > Building FAISS index...")
    dim = embeddings.shape[1]
    index = faiss.IndexHNSWFlat(dim, 32)
    index.add(embeddings)
    print("  > FAISS index built successfully for this embedding model.")

    # Inner loop for generative models
    for gen_model_name in MODELS_TO_TEST:
        print("-" * 80)
        print(f"Now testing GENERATIVE MODEL: {gen_model_name}")
        print("-" * 80)

        model_results = []
        output_folder_name = f"embed_{embed_model_name.replace('/', '_')}__gen_{gen_model_name.replace('/', '_')}"
        model_output_dir = output_dir / output_folder_name
        model_output_dir.mkdir(parents=True, exist_ok=True)

        # Pre-bind so the `del` in `finally` works even when loading fails.
        generator = None
        model = None
        tokenizer = None

        try:
            print("  > Loading generative model and tokenizer...")
            start_time = time.time()

            bnb_config = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_quant_type="nf4",
                bnb_4bit_compute_dtype=torch.bfloat16,
                bnb_4bit_use_double_quant=False,
            )
            # Left padding keeps the prompt adjacent to the generated tokens
            # in batched decoder-only generation.
            tokenizer = AutoTokenizer.from_pretrained(gen_model_name, padding_side='left')

            if tokenizer.pad_token is None:
                print("  > tokenizer.pad_token is None. Setting it to tokenizer.eos_token")
                tokenizer.pad_token = tokenizer.eos_token

            # Fallback template for tokenizers without one: emit the user
            # message content verbatim.
            if tokenizer.chat_template is None:
                tokenizer.chat_template = "{% for message in messages %}{% if message['role'] == 'user' %}{{ message['content'] }}{% endif %}{% endfor %}"

            # NOTE(security): trust_remote_code=True executes Python shipped in
            # the model repository; only use it with repositories you trust.
            model = AutoModelForCausalLM.from_pretrained(
                gen_model_name,
                quantization_config=bnb_config,
                device_map="auto",
                trust_remote_code=True
            )

            model.config.pad_token_id = tokenizer.pad_token_id

            generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
            print(f"  > Generative model loaded in {time.time() - start_time:.2f} seconds.")

            for test_case in TEST_CASES:
                print(f"\n  Processing Test Case: {test_case['id']} - {test_case['description']}")
                start_time = time.time()

                retrieved_chunks, generation_attempts = extract_generation_attempts_for_test_case(
                    test_case["description"], generator, tokenizer, chunks, embeddings, index,
                    embed_bert_with_pooling, embed_model, embed_tokenizer, top_k=TOP_K
                )

                # Flatten frames across attempts, carrying each attempt's
                # retrieval score along for de-duplication.
                all_parsed_frames = []
                all_retrieval_scores = []
                for attempt in generation_attempts:
                    for frame in attempt["parsed_frames"]:
                        all_parsed_frames.append(frame)
                        all_retrieval_scores.append(attempt["retrieval_score"])

                final_unique_frames = deduplicate_frames(all_parsed_frames, all_retrieval_scores)
                print(f"  > Final unique frames extracted: {len(final_unique_frames)}")

                result_for_case = {
                    "embedding_model": embed_model_name,
                    "generative_model": gen_model_name,
                    "test_case_id": test_case["id"],
                    "test_case_description": test_case["description"],
                    "execution_time_seconds": time.time() - start_time,
                    "retrieved_chunks": retrieved_chunks,
                    "generation_attempts": generation_attempts,
                    "final_unique_frames": final_unique_frames,
                    "error": None
                }
                model_results.append(result_for_case)

        except Exception as e:
            # Top-level boundary: one failing model must not stop the sweep.
            # The error is recorded next to any results gathered so far.
            print(f"AN ERROR OCCURRED with model {gen_model_name}: {e}")
            failed_combinations.append((embed_model_name, gen_model_name))
            model_results.append({
                "embedding_model": embed_model_name,
                "generative_model": gen_model_name,
                "error": str(e)
            })
        finally:
            output_path = model_output_dir / "results.json"
            print(f"\n Saving results for this combination to {output_path}")
            with open(output_path, 'w', encoding='utf-8') as f:
                json.dump(model_results, f, ensure_ascii=False, indent=2)

            # Drop references before collecting so GPU memory can be released
            # ahead of the next model load.
            del generator, model, tokenizer
            gc.collect()
            torch.cuda.empty_cache()
            print("  > Generative model memory cleared.")

    del embed_model, embed_tokenizer, embeddings, index
    gc.collect()
    torch.cuda.empty_cache()
    print(f"Embedding model {embed_model_name} memory cleared.")

if failed_combinations:
    print(f"\n\nFinished with errors: {len(failed_combinations)} model combination(s) failed:")
    for failed_embed_name, failed_gen_name in failed_combinations:
        print(f"  - {failed_embed_name} + {failed_gen_name}")
else:
    print("\n\nAll model combinations tested successfully!")

Loading and chunking ISDE text...
Created 205 clean, article-aware chunks.
Now testing with EMBEDDING MODEL: Gerwin/legal-bert-dutch-english
  > Loading embedding model: Gerwin/legal-bert-dutch-english...


Some weights of the model checkpoint at Gerwin/legal-bert-dutch-english were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  > Creating embeddings for all chunks...


Embedding with BERT:   0%|          | 0/13 [00:00<?, ?it/s]

  > Building FAISS index...
  > FAISS index built successfully for this embedding model.
--------------------------------------------------------------------------------
Now testing GENERATIVE MODEL: mistralai/Mistral-7B-Instruct-v0.1
--------------------------------------------------------------------------------
  > Loading generative model and tokenizer...


tokenizer_config.json:   0%|          | 0.00/2.10k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

  > tokenizer.pad_token is None. Setting it to tokenizer.eos_token


config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Device set to use cuda:0


  > Generative model loaded in 96.33 seconds.

  Processing Test Case: TC01 - Ik heb in mijn koopwoning de vloerisolatie met biobased isolatiemateriaal over een oppervlakte van 30 m² laten installeren. De werkzaamheden zijn vorige maand afgerond. Wat is het te verwachten subsidiebedrag?


Embedding with BERT:   0%|          | 0/1 [00:00<?, ?it/s]

  > Retrieved 10 chunks for query: 'Ik heb in mijn koopwoning de vloerisolatie met bio...'
  > Final unique frames extracted: 9

  Processing Test Case: TC02 - Ik wil een lucht-waterwarmtepomp met een thermisch vermogen van 12 kW en een A+++ energielabel installeren in mijn woning. De installatie vindt plaats in augustus 2024. Hoe wordt de subsidie berekend?


Embedding with BERT:   0%|          | 0/1 [00:00<?, ?it/s]

  > Retrieved 10 chunks for query: 'Ik wil een lucht-waterwarmtepomp met een thermisch...'
  > Final unique frames extracted: 9

  Processing Test Case: TC03 - Kom ik in aanmerking voor subsidie als ik een zonneboiler laat installeren met een apertuuroppervlakte van 12 vierkante meter? Het is een zonneboiler met energie-efficiëntieklasse A.


Embedding with BERT:   0%|          | 0/1 [00:00<?, ?it/s]

  > Retrieved 10 chunks for query: 'Kom ik in aanmerking voor subsidie als ik een zonn...'
  > Final unique frames extracted: 10

  Processing Test Case: TC04 - Mijn huis is recent aangesloten op het warmtenet en mijn gasmeter is vorige week verwijderd. Ik wil nu ook subsidie aanvragen voor de eenmalige aanschaf van een nieuwe elektrische kookvoorziening. Is dit mogelijk en wat zijn de voorwaarden?


Embedding with BERT:   0%|          | 0/1 [00:00<?, ?it/s]

  > Retrieved 10 chunks for query: 'Mijn huis is recent aangesloten op het warmtenet e...'
  > Final unique frames extracted: 7

💾 Saving results for this combination to thesis_results_final/embed_Gerwin_legal-bert-dutch-english__gen_mistralai_Mistral-7B-Instruct-v0.1/results.json
  > Generative model memory cleared.
Embedding model Gerwin/legal-bert-dutch-english memory cleared.
Now testing with EMBEDDING MODEL: pdelobelle/robbert-v2-dutch-base
  > Loading embedding model: pdelobelle/robbert-v2-dutch-base...


Some weights of RobertaModel were not initialized from the model checkpoint at pdelobelle/robbert-v2-dutch-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  > Creating embeddings for all chunks...


Embedding with BERT:   0%|          | 0/13 [00:00<?, ?it/s]

  > Building FAISS index...
  > FAISS index built successfully for this embedding model.
--------------------------------------------------------------------------------
Now testing GENERATIVE MODEL: mistralai/Mistral-7B-Instruct-v0.1
--------------------------------------------------------------------------------
  > Loading generative model and tokenizer...
  > tokenizer.pad_token is None. Setting it to tokenizer.eos_token


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0


  > Generative model loaded in 35.66 seconds.

  Processing Test Case: TC01 - Ik heb in mijn koopwoning de vloerisolatie met biobased isolatiemateriaal over een oppervlakte van 30 m² laten installeren. De werkzaamheden zijn vorige maand afgerond. Wat is het te verwachten subsidiebedrag?


Embedding with BERT:   0%|          | 0/1 [00:00<?, ?it/s]

  > Retrieved 10 chunks for query: 'Ik heb in mijn koopwoning de vloerisolatie met bio...'
  > Final unique frames extracted: 10

  Processing Test Case: TC02 - Ik wil een lucht-waterwarmtepomp met een thermisch vermogen van 12 kW en een A+++ energielabel installeren in mijn woning. De installatie vindt plaats in augustus 2024. Hoe wordt de subsidie berekend?


Embedding with BERT:   0%|          | 0/1 [00:00<?, ?it/s]

  > Retrieved 10 chunks for query: 'Ik wil een lucht-waterwarmtepomp met een thermisch...'
  > Final unique frames extracted: 8

  Processing Test Case: TC03 - Kom ik in aanmerking voor subsidie als ik een zonneboiler laat installeren met een apertuuroppervlakte van 12 vierkante meter? Het is een zonneboiler met energie-efficiëntieklasse A.


Embedding with BERT:   0%|          | 0/1 [00:00<?, ?it/s]

  > Retrieved 10 chunks for query: 'Kom ik in aanmerking voor subsidie als ik een zonn...'
  > Final unique frames extracted: 8

  Processing Test Case: TC04 - Mijn huis is recent aangesloten op het warmtenet en mijn gasmeter is vorige week verwijderd. Ik wil nu ook subsidie aanvragen voor de eenmalige aanschaf van een nieuwe elektrische kookvoorziening. Is dit mogelijk en wat zijn de voorwaarden?


Embedding with BERT:   0%|          | 0/1 [00:00<?, ?it/s]

  > Retrieved 10 chunks for query: 'Mijn huis is recent aangesloten op het warmtenet e...'
  > Final unique frames extracted: 7

💾 Saving results for this combination to thesis_results_final/embed_pdelobelle_robbert-v2-dutch-base__gen_mistralai_Mistral-7B-Instruct-v0.1/results.json
  > Generative model memory cleared.
Embedding model pdelobelle/robbert-v2-dutch-base memory cleared.


All model combinations tested successfully!
