In [26]:
import os
import getpass
import requests
import numpy as np
import pandas as pd
from datasets import Dataset
import torch
from transformers import AutoTokenizer, AutoModel
import faiss

In [29]:
xlsx_path = "tb_chunks.xlsx"
df = pd.read_excel(xlsx_path) 

### 1.1. Mały klient HTTP do CLARIN

In [4]:
CLARIN_API_KEY = getpass.getpass("Podaj CLARIN API key (nie będzie wyświetlany): ")

if not CLARIN_API_KEY:
    raise ValueError("Nie podano CLARIN_API_KEY")

os.environ["CLARIN_API_KEY"] = CLARIN_API_KEY

In [5]:
CLARIN_BASE_URL = "https://services.clarin-pl.eu/api/v1/oapi"
CLARIN_API_KEY = os.environ.get("CLARIN_API_KEY")  
if CLARIN_API_KEY is None:
    raise ValueError("Ustaw zmienną środowiskową CLARIN_API_KEY z tokenem z WebServices CLARIN-PL.")

HEADERS = {
    "Authorization": f"Bearer {CLARIN_API_KEY}",
    "Content-Type": "application/json",
}

### 1.2. Lista modeli – /api/v1/oapi/models

In [6]:
def clarin_list_models():
    url = f"{CLARIN_BASE_URL}/models"
    resp = requests.get(url, headers=HEADERS)
    resp.raise_for_status()
    data = resp.json()
    return data

models = clarin_list_models()
print(models)

{'data': [{'id': 'bielik', 'full_name': 'speakleash/Bielik-11B-v2.2-Instruct', 'name': 'speakleash/Bielik-11B-v2.2-Instruct', 'type': 'chat'}, {'id': 'c4ai-command-a', 'full_name': 'CohereForAI/c4ai-command-a-03-2025', 'name': 'CohereForAI/c4ai-command-a-03-2025', 'type': 'chat'}, {'id': 'cohere', 'full_name': 'CohereForAI/c4ai-command-r-plus', 'name': 'CohereForAI/c4ai-command-r-plus', 'type': 'chat'}, {'id': 'gemma-3-4b-it', 'full_name': 'google/gemma-3-4b-it', 'name': 'google/gemma-3-4b-it', 'type': 'chat'}, {'id': 'dariah-eventstotriples', 'full_name': 'dariah/pllum12b-01_10', 'name': 'dariah/pllum12b-01_10', 'type': 'chat'}, {'id': 'clarin-affect-pllum', 'full_name': 'dariah/clarin-affect-pllum-autoregressive', 'name': 'dariah/clarin-affect-pllum-autoregressive', 'type': 'chat'}, {'id': 'llama3.1', 'full_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct', 'name': 'meta-llama/Meta-Llama-3.1-70B-Instruct', 'type': 'chat'}, {'id': 'llama-guard', 'full_name': 'meta-llama/Llama-Guard-3-8B

In [7]:
CLARIN_MODEL_NAME = "llama3.1"

Komentarz do wyboru Meta Llamy 3.1 70B Instruct: Najlepsze EN, najlepszy reasoning, najmniej halucynacji, idealny do RAG i raportów

### 1.3. Chat completions – /api/v1/oapi/chat/completions

In [8]:
def clarin_chat(messages, model: str = CLARIN_MODEL_NAME,
                temperature: float = 0.1,
                max_tokens: int = 1024) -> str:
    """
    messages: lista słowników {"role": "system"|"user"|"assistant", "content": "..."}
    """
    url = f"{CLARIN_BASE_URL}/chat/completions"
    payload = {
        "model": model,
        "messages": messages,
        "temperature": temperature,
        "max_tokens": max_tokens,
    }
    resp = requests.post(url, headers=HEADERS, json=payload)
    resp.raise_for_status()
    data = resp.json()
    # zgodnie z konwencją OpenAI
    return data["choices"][0]["message"]["content"]

In [9]:
test_messages = [
    {"role": "system", "content": "Jesteś asystentem, odpowiadasz krótko po polsku."},
    {"role": "user", "content": "Wyjaśnij w jednym zdaniu co to jest RAG (Retrievial Augmented Generation)."},
]
print(clarin_chat(test_messages))


RAG (Retrieval-Augmented Generation) to model językowy, który łączy mechanizmy wyszukiwania i generowania tekstu, aby poprawić jakość i dokładność wygenerowanego tekstu.


### 2. Krok 2 – ładujemy modele embeddingowe i używamy naszego indeksu regulacji jako retrievera

In [20]:
model_ckpt = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModel.from_pretrained(model_ckpt)

In [27]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

MPNetModel(
  (embeddings): MPNetEmbeddings(
    (word_embeddings): Embedding(30527, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): MPNetEncoder(
    (layer): ModuleList(
      (0-11): 12 x MPNetLayer(
        (attention): MPNetAttention(
          (attn): MPNetSelfAttention(
            (q): Linear(in_features=768, out_features=768, bias=True)
            (k): Linear(in_features=768, out_features=768, bias=True)
            (v): Linear(in_features=768, out_features=768, bias=True)
            (o): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (intermediate): MPNetIntermediate(
          (dense): Linear(in_

In [14]:
TEXT_COL =  "text"

embeddings_ds = Dataset.load_from_disk("regs_chunks_with_embeddings")

all_embs = np.vstack(embeddings_ds["embeddings"]).astype("float32")
num_docs, emb_dim = all_embs.shape
print(num_docs, emb_dim)

faiss_index = faiss.IndexFlatIP(emb_dim)
faiss_index.add(all_embs)

2836 768


### 2.1. Funkcja: semantic search -> top_k chunków

In [28]:
def cls_pooling(model_output):
    return model_output.last_hidden_state[:, 0]

In [None]:
def embed_query(query: str) -> np.ndarray:
    
    encoded = tokenizer(
        [query],
        padding=True,
        truncation=True,
        max_length=512,
        return_tensors="pt",
    ).to(device)
    with torch.no_grad():
        output = model(**encoded)
        q_emb = cls_pooling(output)  # ta sama jak wcześniej
        q_emb = torch.nn.functional.normalize(q_emb, p=2, dim=1)
    return q_emb.cpu().numpy().astype("float32")  # (1, dim)


def retrieve_contexts_local(query: str, top_k: int = 8) -> list[dict]:
    q_emb = embed_query(query)      # (1, d)
    scores, idx = faiss_index.search(q_emb, top_k)   # idx: (1, k)
    scores = scores[0]
    idx = idx[0]

    rows = df.iloc[idx].copy()
    rows["score"] = scores

    contexts = []
    for _, row in rows.iterrows():
        contexts.append({
            "text": str(row[TEXT_COL]),
            "anchor": row.get("anchor"),
            "doc_id": row.get("doc_id"),
            "reg_no": row.get("reg_no"),
            "section_path": row.get("section_path"),
            "annex_id": row.get("annex_id"),
        })
    return contexts

### 3. Krok 3 – moduł generacji (RAG) z CLARIN

In [31]:
REPORT_SYSTEM_PROMPT = """
You are a senior homologation / regulatory engineer helping non-technical readers.

You receive:
- A description of a software or logic change in a vehicle system or function
  (for example: lighting, braking, ADAS, HMI, cybersecurity, diagnostics, etc.).
- Several passages from UN/EU automotive regulations (e.g. UNECE R48, R13, R79, R156, ...).

Your tasks:
1) Use ONLY the provided regulation passages as your legal source.
2) Produce a short, readable report in clear English, for non-technical people.
3) Always include explicit clause references (e.g. “UNECE R48 §6.5.8”) with citations.
4) If something is not covered by the context, clearly say “Not covered in the provided regulations”.

Output format (markdown):

# Regulatory impact report - {short_title}

## 1. Change summary
- 2-4 bullet points that restate the change in simple terms.

## 2. Potentially impacted functions
- Brief bullets: which vehicle functions / ECUs are affected
  (based ONLY on the change description and context).
- Explain in very simple terms what they do.

## 3. Applicable regulations and clauses
Create a small bullet list or table. For each relevant clause:
- Regulation and clause (e.g. "UNECE R48 §6.5.8").
- One-line explanation of what it requires.
- Why it is relevant to this change.

## 4. Compliance assessment for this change
For each clause from section 3, classify:
- Status: one of {PASS, POSSIBLE IMPACT - REVIEW, OUT OF SCOPE}.
- Short rationale in plain language, grounded in the context passages.
- If context is unclear or missing, mark POSSIBLE IMPACT - REVIEW and say what needs to be checked.

## 5. Suggested test plan & documentation
- 4-6 bullet points with high-level tests or checks
  (e.g. timing, behaviors in edge cases, malfunction warnings, traceability).
- Reference the relevant clauses when possible.
- Keep it non-technical (what needs to be checked, not how to code it).

Rules:
- Do not invent regulation text; stay within the given context.
- If multiple documents disagree, briefly mention the uncertainty.
- Keep wording short and business-oriented, not academic.
"""


### 3.2. Budowa promptu użytkownika (change + context)

In [32]:
import textwrap

def build_user_prompt_report(change_description: str,
                             context_chunks: list[dict]) -> str:
    ctx_lines = []
    for i, c in enumerate(context_chunks, 1):
        snippet = c["text"].replace("\n\n", "\n").strip()
        snippet = textwrap.shorten(snippet, width=1400, placeholder=" …")

        anchor = c.get("anchor")
        if not anchor:
            anchor = f"{c.get('reg_no', 'Regulation')} {c.get('section_path') or c.get('annex_id') or ''}".strip()

        ctx_lines.append(
            f"### [{i}] {anchor}\n"
            f"Source: {c.get('doc_id')}\n"
            f"{snippet}\n"
        )

    ctx_block = "\n\n".join(ctx_lines)

    user_msg = f"""
You are assessing one specific software / logic change.

Change description:
\"\"\"{change_description}\"\"\"

Below are the most relevant regulation excerpts:

{ctx_block}

Using ONLY these passages, write the report in the requested format.
If you are unsure, mark the item as REVIEW and explain what needs human validation.
"""
    return user_msg.strip()


### 3.3. Funkcja RAG: retrieval → CLARIN → odpowiedź

In [33]:
def run_regulatory_report_clarin(change_description: str,
                                 top_k: int = 8) -> str:
    # 1) retrieval
    contexts = retrieve_contexts_local(change_description, top_k=top_k)
    if not contexts:
        return "No relevant regulation passages were found in the index."

    # 2) budowa promptu
    user_prompt = build_user_prompt_report(change_description, contexts)
    short_title = change_description.strip().replace("\n", " ")[:80]

    # 3) wywołanie CLARIN chat completion
    messages = [
        {"role": "system", "content": REPORT_SYSTEM_PROMPT},
        {"role": "system", "content": f"Short title for the report: {short_title}"},
        {"role": "user", "content": user_prompt},
    ]
    answer = clarin_chat(messages, model=CLARIN_MODEL_NAME, temperature=0.2, max_tokens=2000)
    return answer


### 4. Przykładowe zapytania (use-case zmiany w softwarze)

In [34]:
change1 = """
LIGHT LEFT TURN.V2 - change from version R11 to R12:
- Turn indicator status signals changed from .Req to .Info
- Updated diagnostics and fallback if ETH_L_TurnCluster.Info is missing
- Revised animation when mirror turn signal shares light guide with position lamp
"""

change2 = """
Low Beam / High Beam logic
- Updated LB/HB requirements with new fallback conditions
- Modified high-beam signal mapping
- Clarified simultaneous LB+HB activation feasibility
"""

change3 = """
Direction indicator tell-tale behavior changed: visual indicator now flashes at a constant rate, 
even when one trailer indicator lamp fails.
"""

for ch in [change1, change2, change3]:
    print("="*80)
    print("CHANGE:\n", ch)
    print("\nRAG ANSWER (CLARIN):\n")
    print(run_regulatory_report_clarin(ch, top_k=8))
    print("\n\n")


CHANGE:
 
LIGHT LEFT TURN.V2 - change from version R11 to R12:
- Turn indicator status signals changed from .Req to .Info
- Updated diagnostics and fallback if ETH_L_TurnCluster.Info is missing
- Revised animation when mirror turn signal shares light guide with position lamp


RAG ANSWER (CLARIN):

# Regulatory Impact Report - LIGHT LEFT TURN.V2

## 1. Change Summary
* The turn indicator status signals have been updated from ".Req" to ".Info".
* Diagnostics and fallback behavior have been modified in case the ETH_L_TurnCluster.Info signal is missing.
* Animation has been revised when the mirror turn signal shares a light guide with the position lamp.

## 2. Potentially Impacted Functions
* Turn indicator lamps: These lamps indicate the intention of the driver to change direction or lanes.
* Position lamps: These lamps provide visibility to the vehicle's presence and dimensions.

## 3. Applicable Regulations and Clauses
| Regulation | Clause | Requirement | Relevance |
| --- | --- | ---