In [None]:
import torch
import torch.nn.functional as F
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from langchain.chat_models import ChatOpenAI
from langchain.chains.llm import LLMChain
from langchain.prompts import PromptTemplate
import os

In [None]:
# OpenAI credentials for the gpt-4o calls further down.
# Preferred: export OPENAI_API_KEY in the environment before starting the kernel so that
# no secret is ever saved inside the notebook. The placeholder below is only used when
# the variable is not set at all; an already exported key is never overwritten.
# If no OpenAI key is available, load a different model instead of gpt-4o.
os.environ.setdefault("OPENAI_API_KEY", "sk-proj")

In [None]:
# Load the BAAI/bge-m3 embedding model.
# Use the GPU when one is visible to torch and fall back to the CPU otherwise, so the
# notebook also runs on machines without CUDA instead of failing in this cell.
embedding_device = "cuda" if torch.cuda.is_available() else "cpu"
embedding_model = HuggingFaceEmbeddings(
    model_name="BAAI/bge-m3",
    model_kwargs={"device": embedding_device},
    encode_kwargs={"normalize_embeddings": True},  # request normalized embedding vectors
)

In [None]:
# Open the stored FAISS index and expose it as a retriever.
# The embedding model passed here must be the one that produced the vectors in the index.
faiss_index = FAISS.load_local(
    "faiss_index_bge_m3",
    embedding_model,
    # NOTE(security): this flag opts in to pickle-based deserialization of the saved
    # index; only load index folders that come from a trusted source.
    allow_dangerous_deserialization=True,
)
retriever = faiss_index.as_retriever(
    search_kwargs={"k": 1000},  # fetch the 1000 chunks most similar to the query
    search_type="similarity",
)

In [None]:
# Reranker: tokenizer plus sequence-classification model for BAAI/bge-reranker-v2-m3.
# Both are loaded with the library defaults; this cell does not move the model to a GPU.
reranker_model_name = "BAAI/bge-reranker-v2-m3"
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=reranker_model_name
)
model = AutoModelForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path=reranker_model_name
)

def rerank(query: str, docs: list, top_k: int = 10, batch_size: int = 32):
    """Order ``docs`` by reranker relevance to ``query`` and keep the best ``top_k``.

    Every (query, document text) pair is scored with the module-level ``tokenizer``
    and ``model``. Pairs are scored in mini-batches so that a large candidate list
    (the retriever hands over 1000 chunks) is never pushed through the model in a
    single forward pass. Ranking uses the raw logits: a softmax across candidates
    does not change the order and can underflow to tied zeros for low-scoring chunks.

    Args:
        query: The search query every document is compared against.
        docs: Objects exposing a ``page_content`` string (LangChain documents).
        top_k: Maximum number of documents to return.
        batch_size: Number of (query, document) pairs scored per forward pass.

    Returns:
        A list with at most ``top_k`` documents, most relevant first. Documents with
        equal scores keep their input order. An empty ``docs`` yields an empty list.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")
    if not docs:
        # Nothing to score; the tokenizer cannot handle an empty batch.
        return []

    scores = []
    with torch.no_grad():
        for start in range(0, len(docs), batch_size):
            batch = docs[start:start + batch_size]
            pairs = [(query, doc.page_content) for doc in batch]
            inputs = tokenizer(pairs, padding=True, truncation=True, return_tensors="pt")
            # Keep inputs on the same device as the model so the function also works
            # when the reranker has been moved to a GPU.
            inputs = inputs.to(model.device)
            logits = model(**inputs).logits.view(-1)
            scores.extend(logits.float().tolist())

    # sorted() is stable, so ties stay in retrieval order.
    ranked = sorted(zip(docs, scores), key=lambda pair: pair[1], reverse=True)
    return [doc for doc, _ in ranked[:top_k]]

In [None]:
# Set up the gpt-4o chat model used to generate the final answer
llm = ChatOpenAI(
    max_tokens=16384,  # per the original note: the maximum of output tokens gpt-4o can generate
    temperature=0,
    model_name="gpt-4o",
)

In [None]:
# Prompt template: every {placeholder} below is supplied later through the inputs dict
system_template = """
Your task is: {task}
You are acting as a {persona}. 

Use the following context to inform your answer, but keep in mind all Dutch-language texts are about Dutch educational policies. English-language texts are about international contexts:
-------------------
{context}
-------------------

Here are examples of how answers should be structured:
{exemplars}

Respond in a {tone} tone, and format the answer as follows:
{format}

Do not skip any of the documents given. 
Always give a result. Do not repeat the same results.
"""
# The input variables can be filled in separately
prompt = PromptTemplate(
    template=system_template,
    input_variables=[
        "persona",
        "task",
        "context",
        "exemplars",
        "tone",
        "format",
    ],
)

In [None]:
# The question used for retrieval and reranking
query = (
    "Overview of the key laws, legal frameworks, regulations, rules "
    "and recent policy changes affecting education in the Netherlands."
)

In [None]:
def _format_chunk(doc):
    """Render one retrieved chunk as a metadata header followed by its stripped text."""
    meta = doc.metadata
    return (
        f"Document ID: {meta.get('document_id', 'N/A')} | "
        f"Title: {meta.get('title', 'N/A')} | "
        f"Source: {meta.get('source', 'N/A')}\n"
        f"Chunk: {meta.get('chunk_index', 'N/A')} | "
        f"Chars: [{meta.get('chunk_char_start', 'N/A')}–{meta.get('chunk_char_end', 'N/A')}]\n"
        f"Content:\n{doc.page_content.strip()}"
    )


# Step 1: pull the candidate chunks for the query from the vector store
retrieved_docs = retriever.get_relevant_documents(query)
# Step 2: keep the 100 best candidates according to the reranker.
# A larger top_k is possible as long as the result still fits in the model context window.
reranked_docs = rerank(query, retrieved_docs, top_k=100)
# Step 3: build the context string, one metadata-annotated entry per chunk
relevant_docs = "\n\n".join(map(_format_chunk, reranked_docs))

In [None]:
# Values for the prompt template placeholders.
# Re-run this cell for every new query so that "context" holds the freshly retrieved
# documents and the {subtask} placeholders in "persona" and "task" are restored.
inputs_csv = dict(
    persona="senior educational policy analyst. Your task is to extract educational policy changes {subtask} from the given context",
    task="Extract as many educational policy changes {subtask} from the retrieved content.",
    context=relevant_docs,  # the retrieved and reranked documents
    exemplars="Create a table which exists of the country, year, title of the educational policy change and a description of this change. If the year is unknown leave it blank. Do not limit the number of columns in the output",
    tone="Use a neutral, historical, and informative tone to describe educational developments.",
    format="A table or Csv file including rows of country, year, educational change and a brief description of this educational change. Do not limit the number of columns in the output, include all available entries if possible.",
)

In [None]:
# Collection of subtasks. Each one narrows the extraction to one area of educational
# policy; combine a subtask with a query on the same topic. The text is substituted
# mid-sentence into the "persona" and "task" prompts, so it must not end with a period.
subtask = "in the Netherlands"
subtask_primary = "on primary education in the Netherlands"
# vmbo, havo and vwo are the secondary tracks (hbo is higher professional education)
subtask_secondary = "on secondary education, like the vmbo, havo and vwo in the Netherlands"
subtask_higher = "on higher education, like hogescholen and universities in the Netherlands"
subtask_vocational = "on vocational education like the mbo and adult education in the Netherlands"
subtask_system = "on the structure of education, curriculum and the role of regulatory bodies in the Netherlands"
subtask_programs = "on educational programs in the Netherlands"
subtask_teacher = "on teachers, teacher programs, teachers education and their working conditions in the Netherlands"
subtask_loans = "on student loans and student grants in the Netherlands"
subtask_disadvantage = "on programs for disadvantaged students and special education in the Netherlands"
subtask_religion = "on religious education in the Netherlands"
subtask_exams = "on the exams and language and arithmetic tests in education in the Netherlands"
subtask_finance = "on school finances, including salaries and budgets in the Netherlands"

In [None]:
# Substitute the chosen subtask into the "persona" and "task" prompts.
# To target another policy area, change both `subtask` arguments below.
# NOTE: this overwrites the entries in place, so the {subtask} placeholders are gone
# afterwards; re-run the cell that defines inputs_csv before running this one again.
inputs_csv.update(
    persona=inputs_csv["persona"].format(subtask=subtask),
    task=inputs_csv["task"].format(subtask=subtask),
)

In [None]:
# Send the fully populated prompt to the configured LLM (gpt-4o) and show its reply
rag_chain = LLMChain(prompt=prompt, llm=llm)
answer = rag_chain.run(inputs_csv)
print(answer)