In [1]:
import os
import re
import glob
import faiss
import torch
from pathlib import Path
from config import DevelopmentConfig, ProductionConfig, ModelConfig
from flask import Flask, request, jsonify
from flask_cors import CORS
from sentence_transformers import SentenceTransformer
from transformers import AutoModelForCausalLM, AutoTokenizer
import numpy as np
import jinja2
import json
from collections import defaultdict

In [2]:
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
%env CUDA_VISIBLE_DEVICES=0

env: CUDA_DEVICE_ORDER=PCI_BUS_ID
env: CUDA_VISIBLE_DEVICES=0


In [5]:
# Local model configuration 
files_path = Path("/home/vicente/Github/BDLab-Agent/backend/data/kaggle/us-senate-bill")
files_list = list(files_path.glob("*.txt"))
total_files = len(files_list)


def text_splitter(text, chunk_size=200, chunk_overlap=50):
    """
    Splits a text into overlapping chunks manually.
    """
    if chunk_overlap >= chunk_size:
        raise ValueError("chunk_overlap must be smaller than chunk_size.")

    chunks = []
    start_index = 0

    # loop to capture chunks with overlap
    while start_index < len(text):
        end_index = start_index + chunk_size

        chunk = text[start_index:end_index]
        chunks.append(chunk)
        
        start_index += chunk_size - chunk_overlap

    return chunks

# Read the .txt files and split them into chunks: [{file_id, title, chunk_id}]
raw_chunks = []
chunk_metadata = []
file_titles = {}

for path in files_list:
    text = path.read_text(encoding="utf-8")
    file_id = path.stem
    title = path.name
    file_titles[file_id] = title
  
    # split the entire document text with overlap
    document_chunks = text_splitter(text, chunk_size=200, chunk_overlap=50)

    for chunk_id, chunk in enumerate(document_chunks):

        # this adds enrichment to each chunk so the embedding captures a more complete context representation
        chunk_with_title = f"From the bill titled '{title}': {chunk}"
        raw_chunks.append(chunk_with_title)
        chunk_metadata.append({
            "file_id": file_id,
            "title": title,
            "chunk_id": chunk_id
        })
        
print(f"Loaded {total_files} files, created {len(raw_chunks)} chunks.")

Loaded 7 files, created 3886 chunks.


In [None]:
#lets look at the chunks
for i in range(3):
    print("\n==========================================================================\n")
    print(f"CHUNK {i}:\n")
    print(f"Text: {raw_chunks[i]}\n")
    print(f"Metadata: {chunk_metadata[i]}\n")




CHUNK 0:

Text: From the bill titled 'Trooper Werner Foerster and Frank Connor Justice Act.txt': A BILL
To call for the immediate extradition or return to the United States of convicted felon Joanne Chesimard, William “Guillermo” Morales, and all other fugitives who are receiving safe haven in Cuba to escape prosecution or confinement for criminal offenses committed in the United States.

Be it enacted by the Senate and House of Representatives of the United States of America in Congress assembled,

SECTION 1. SHORT TITLE.

This Act may be cited as the “Trooper Werner Foerster and Frank Con

Metadata: {'file_id': 'Trooper Werner Foerster and Frank Connor Justice Act', 'title': 'Trooper Werner Foerster and Frank Connor Justice Act.txt', 'chunk_id': 0}



CHUNK 1:

Text: From the bill titled 'Trooper Werner Foerster and Frank Connor Justice Act.txt': ited as the “Trooper Werner Foerster and Frank Connor Justice Act”.

SEC. 2. FINDINGS.

Congress makes the following findings:

(1) Joann

In [6]:
embed_model = "data/embeddings/gte-large"
embedder = SentenceTransformer(embed_model)

# embed the chunks
chunk_embs = embedder.encode(
    raw_chunks,
    convert_to_numpy=True,
    show_progress_bar=True
).astype(np.float32)

# normalize for cosine similarity search
faiss.normalize_L2(chunk_embs)

dim = int(chunk_embs.shape[1])
index = faiss.IndexFlatIP(dim)
index.add(chunk_embs)

Batches:   0%|          | 0/122 [00:00<?, ?it/s]

In [7]:
# lets check the index
print(f"FAISS index contains {index.ntotal} vectors of dimension {dim}")

# lets also look at some embeddings
for i in range(3):
    print(f"Embedding for chunk {i}: {chunk_embs[i][:6]}")


FAISS index contains 3886 vectors of dimension 1024
Embedding for chunk 0: [-0.0081767   0.00876884 -0.01092891  0.05729601 -0.0350116  -0.02136488]
Embedding for chunk 1: [ 0.00775521  0.01174085 -0.01212076  0.05886779 -0.02789694 -0.00757028]
Embedding for chunk 2: [-0.0080538  -0.020599    0.02262611  0.04469621 -0.0261443  -0.03361754]


In [8]:
base = Path("/home/vicente/Github/BDLab-Agent/backend/data/GPTModels/gpt-oss-20b")
model = AutoModelForCausalLM.from_pretrained(str(base),dtype=torch.bfloat16,device_map="auto",local_files_only=True)
tokenizer = AutoTokenizer.from_pretrained(str(base),local_files_only=True)
model.eval()


Fetching 40 files:   0%|          | 0/40 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Fetching 40 files:   0%|          | 0/40 [00:00<?, ?it/s]

GptOssForCausalLM(
  (model): GptOssModel(
    (embed_tokens): Embedding(201088, 2880, padding_idx=199999)
    (layers): ModuleList(
      (0-23): 24 x GptOssDecoderLayer(
        (self_attn): GptOssAttention(
          (q_proj): Linear(in_features=2880, out_features=4096, bias=True)
          (k_proj): Linear(in_features=2880, out_features=512, bias=True)
          (v_proj): Linear(in_features=2880, out_features=512, bias=True)
          (o_proj): Linear(in_features=4096, out_features=2880, bias=True)
        )
        (mlp): GptOssMLP(
          (router): GptOssTopKRouter()
          (experts): Mxfp4GptOssExperts()
        )
        (input_layernorm): GptOssRMSNorm((2880,), eps=1e-05)
        (post_attention_layernorm): GptOssRMSNorm((2880,), eps=1e-05)
      )
    )
    (norm): GptOssRMSNorm((2880,), eps=1e-05)
    (rotary_emb): GptOssRotaryEmbedding()
  )
  (lm_head): Linear(in_features=2880, out_features=201088, bias=False)
)

## Architectural breakdown

model = AutoModel... loaded a model with these advanced features:

- Causal LM: It's designed for text generation.
- Deep Architecture: It has 24 layers to build a deep understanding of text.
- Mixture of Experts (MoE): This is its most defining feature. It uses a router to select from a pool of specialist networks, making it very parameter-efficient during inference.
- Grouped-Query Attention (GQA): It uses a modern, efficient attention mechanism to speed up processing and save memory.
- Modern Components: It uses RMSNorm for normalization and RotaryEmbedding (rotary_emb) for positional encoding, which are both state-of-the-art techniques.

Finally, the parameters you used in your code are what make it possible to run this beast:

- dtype=torch.bfloat16: Loads the model in a 16-bit format instead of 32-bit, effectively cutting the memory requirement in half.
- device_map="auto": Intelligently distributes the 24 layers across your available hardware (e.g., multiple GPUs) if one isn't big enough to hold the entire model.

In [None]:
print(tokenizer.chat_template)

{#-
  In addition to the normal inputs of `messages` and `tools`, this template also accepts the
  following kwargs:
  - "builtin_tools": A list, can contain "browser" and/or "python".
  - "model_identity": A string that optionally describes the model identity.
  - "reasoning_effort": A string that describes the reasoning effort, defaults to "medium".
 #}

{%- macro render_typescript_type(param_spec, required_params, is_nullable=false) -%}
    {%- if param_spec.type == "array" -%}
        {%- if param_spec['items'] -%}
            {%- if param_spec['items']['type'] == "string" -%}
                {{- "string[]" }}
            {%- elif param_spec['items']['type'] == "number" -%}
                {{- "number[]" }}
            {%- elif param_spec['items']['type'] == "integer" -%}
                {{- "number[]" }}
            {%- elif param_spec['items']['type'] == "boolean" -%}
                {{- "boolean[]" }}
            {%- else -%}
                {%- set inner_type = render_typescrip

In [10]:
def chat_oss(user_prompt, system_prompt=None, max_new_tokens=512, do_sample=True, temperature=0.1):
    """
    Core function to generate a response from the LLM without external context.
    """
    messages = []
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})
    messages.append({"role": "user", "content": user_prompt})

    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    
    gen_kwargs = {
        "max_new_tokens": max_new_tokens,
        "do_sample": do_sample,
        "pad_token_id": tokenizer.eos_token_id
    }
    if do_sample:
        gen_kwargs["temperature"] = temperature

    with torch.inference_mode():
        outputs = model.generate(**inputs, **gen_kwargs)

    prompt_length = inputs["input_ids"].shape[1]
    raw_output = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    
    clean_response = raw_output.split("assistantfinal")[-1].strip()
    if clean_response.startswith("analysis"):
        clean_response = clean_response[len("analysis"):].strip()
        
    return clean_response

# A simple system prompt to keep the model helpful.
base_system_prompt = "You are a helpful assistant."

question = "What were the key findings of Project Minerva according to the final report?"

# Calling the model without any external context.
response = chat_oss(
    user_prompt=question,
    system_prompt=base_system_prompt,
    do_sample=False
)

print(f"Question: {question}\n")
print(f"Model's Answer (Without RAG): \n{response}")

Question: What were the key findings of Project Minerva according to the final report?

Model's Answer (Without RAG): 
The user asks: "What were the key findings of Project Minerva according to the final report?" We need to answer. But we need to check if "Project Minerva" is a real or fictional project. It might be a fictional or a real project. We need to see if there's any known Project Minerva. There's a Project Minerva in the context of the "Minerva" project by the US Army? Or maybe it's a fictional project. There's also "Project Minerva" in the context of the "Minerva" project by the US Department of Defense? There's also "Project Minerva" in the context of the "Minerva" project by the US Army's "Project Minerva" to develop a new type of soldier. But I'm not sure. There's also "Project Minerva" in the context of the "Minerva" project by the US Army's "Project Minerva" to develop a new type of soldier. But I'm not sure. There's also "Project Minerva" in the context of the "Minerva

## We have a working model but we need to add the context

### Here is what we did:
- You have your knowledge base loaded, chunked, and indexed for fast semantic searching. 
- This is the most computationally intensive part.

Next: We build the "Augmented Generation" part.

Here are the fundamental steps to complete the pipeline, turning your indexed data into an interactive Q&A system.

## The RAG Workflow from Here:

1. Get a User's Question: The process starts when a user asks a question (e.g., "What is this bill about?").

2. Retrieve Relevant Context: We'll take that question, embed it using the same gte-large model, and use our FAISS index to find the top 3-5 most relevant text chunks from the 1004 you indexed.

3. Create a Prompt: We will create a special prompt for the LLM. This prompt will contain both the user's original question and the relevant chunks we just retrieved. The prompt essentially says: "Hey LLM, answer this question, but base your answer only on the context I'm providing you."

4. Generate the Answer: We'll send this combined prompt to your gpt-oss-20b model, which will then generate a response that is grounded in the retrieved documents.

### Next we do step 2: Create the retgriever function

In [11]:
def retrieve_context(query, k=3):
    """
    Retrieves the top-k most relevant chunks from the FAISS index for a given query.
    """
    print(f"Retrieving context for query: '{query}'")

    # embed the query
    query_emb = embedder.encode([query], convert_to_numpy=True).astype(np.float32)

    # normalize the query embedding (for cosine similarity)
    faiss.normalize_L2(query_emb)

    # search the FAISS index
    distances, indices = index.search(query_emb, k)

    # fetch the actual text chunks using the indices
    retrieve_chunks_text = []
    for i in indices[0]:
        retrieve_chunks_text.append(raw_chunks[i])

    retrieved_chunks_meta = []
    for i in indices[0]:
        retrieved_chunks_meta.append(chunk_metadata[i])

    # we combine single context string
    context = "\n\n---\n\n".join(retrieve_chunks_text)

    print("Context retrieved successfully.")

    # Return both the context string AND the list of metadata dictionaries
    return context, retrieved_chunks_meta

## Combine Everything into a Q&A Function

Now we'll create a main function that orchestrates the whole process: retrieving the context, formatting the special prompt, and calling your chat_oss LLM function to get the final answer.

In [12]:
def ask_rag(query):
    """
    The complete RAG pipeline.
    Retrieves context, builds a prompt, and generates an answer with sources.
    """
    # First retrieve context
    retrieved_context, sources = retrieve_context(query, k=3)

    # Now we create the RAG prompt:
    # This Combine the context and query into a single prompt for the LLM, (instructing it on how to behave)
    augmented_prompt  = """
        You are a helpful assistant for answering questions about US Senate bills and Acts.
        Use the following context to answer the user's question.
        If the answer is not found in the context, state that you cannot find the answer in the provided documents.
        Do not use any external knowledge or make up information.

        (START CONTEXT): {context} (END CONTEXT).

        USER QUESTION: {question} """.strip()

    # based on the prompt template, we create the final prompt text passing in the retrieved context and user question
    final_prompt_text = augmented_prompt.format(context=retrieved_context, question=query)

    print("\nGENERATING RESPONSE:\n")

    # passing the fully formatted RAG prompt as the "user_prompt"
    response = chat_oss(final_prompt_text, max_new_tokens=512)

    # print("SOURCES USED:\n")
    # for i, meta in enumerate(sources):
    #     print(f"Source {i+1}: {meta['title']} (Chunk ID: {meta['chunk_id']})")
    
    return response, sources
  

## FInally we can ask questions


In [13]:
# Example 1:
response, sources = ask_rag("Who is Joanne Chesimard and what did she do?")
print("\n\nFINAL ANSWER:\n")
print(response)

Retrieving context for query: 'Who is Joanne Chesimard and what did she do?'
Context retrieved successfully.

GENERATING RESPONSE:



FINAL ANSWER:

**Joanne Chesimard** is the individual referenced in the “Trooper Werner Foerster and Frank Connor Justice Act.” According to the bill:

* She is listed on the Federal Bureau of Investigation’s records.  
* She was a member of the Black Liberation Army extremist organization.  
* On **May 2, 1973**, she and two accomplices opened fire on two New Jersey State Police officers (the text is cut off but indicates a shooting incident).  
* She was convicted of murder and sentenced to life imprisonment.  
*


In [14]:
print(sources)

[{'file_id': 'Trooper Werner Foerster and Frank Connor Justice Act', 'title': 'Trooper Werner Foerster and Frank Connor Justice Act.txt', 'chunk_id': 8}, {'file_id': 'Trooper Werner Foerster and Frank Connor Justice Act', 'title': 'Trooper Werner Foerster and Frank Connor Justice Act.txt', 'chunk_id': 3}, {'file_id': 'Trooper Werner Foerster and Frank Connor Justice Act', 'title': 'Trooper Werner Foerster and Frank Connor Justice Act.txt', 'chunk_id': 5}]


In [17]:
# summarization-style question
response2, sources2 = ask_rag("Provide a summary of the 'Trooper Werner Foerster and Frank Connor Justice Act'.")

print(response2)

Retrieving context for query: 'Provide a summary of the 'Trooper Werner Foerster and Frank Connor Justice Act'.'
Context retrieved successfully.

GENERATING RESPONSE:

**Summary of the “Trooper Werner Foerster and Frank Connor Justice Act”**

- **Purpose**: The Act seeks to compel the immediate extradition or return to the United States of convicted felon **Joanne Chesimard**, William “Guillermo” Morales, and any other fugitives who are believed to be receiving safe haven in Cuba in order to avoid prosecution or confinement for crimes committed in the United States.

- **Background on Joanne Chesimard**:
  - Chesimard, a member of the Black Liberation Army, opened fire on two New Jersey State troopers on May 2, 1973, wounding one trooper and killing State Trooper **Werner Foerster** at point‑blank range.
  - She was found guilty of first‑degree murder in a six‑week trial in March 1977 and sentenced to life imprisonment.
  - On November 2, 1979, she escaped from the Edna Mahan Correctio

In [18]:
print(sources2)

[{'file_id': 'Trooper Werner Foerster and Frank Connor Justice Act', 'title': 'Trooper Werner Foerster and Frank Connor Justice Act.txt', 'chunk_id': 0}, {'file_id': 'Trooper Werner Foerster and Frank Connor Justice Act', 'title': 'Trooper Werner Foerster and Frank Connor Justice Act.txt', 'chunk_id': 2}, {'file_id': 'Trooper Werner Foerster and Frank Connor Justice Act', 'title': 'Trooper Werner Foerster and Frank Connor Justice Act.txt', 'chunk_id': 1}]
