In [None]:
import torch
import os

from sentence_transformers import CrossEncoder
from src.chain_initialisation import init_chain
from src.embedding_management import init_embeddings, add_embeddings_from_files
from src.model_initialisation import init_model
from src.pipeline_initialisation import init_pipeline
from src.retriever_initialisation import init_retriever
from src.dynamic_doc_retrieval import download_documents, initialise_keyword_model, generate_query_from_question

In [None]:
# One-off start-up: build the objects that every later cell reuses.
# The CUDA flag is computed once and handed to both the model loader and the
# embedding initialiser.
cuda_available = torch.cuda.is_available()
print(f"Initializing model... CUDA available: {cuda_available}")
# init_model returns the model together with its name; both are forwarded to
# init_pipeline.
model, model_name = init_model(cuda_available)
pipeline = init_pipeline(model=model, model_name=model_name)
# Vector store that later cells hand to init_retriever and extend via
# add_embeddings_from_files when running in online mode.
vectorstore = init_embeddings(cuda_available)

In [3]:
# Directory layout, relative to the current working directory.
# DYNAMIC_DATA_DIR is the folder passed to download_documents by ask_question.
BASE_DIR = os.curdir
DATA_DIR = os.path.join(BASE_DIR, "data")
DYNAMIC_DATA_DIR = os.path.join(BASE_DIR, "data", "dynamic")
print(f"dynamic data dir: {DYNAMIC_DATA_DIR}")

def _required_int_env(name: str) -> int:
    """Return environment variable ``name`` parsed as an integer.

    Raises:
        ValueError: if the variable is unset or its value is not an integer.
            The message names the variable so the missing setting is obvious.
    """
    raw = os.getenv(name)
    if raw is None:
        raise ValueError(
            f"Environment variable '{name}' must be set to an integer "
            "before calling ask_question(online=True)."
        )
    try:
        return int(raw)
    except ValueError as exc:
        raise ValueError(
            f"Environment variable '{name}' must be an integer, got {raw!r}."
        ) from exc


def ask_question(question, online=True, top_k=3, min_score=0.5):
    """Answer ``question`` with the retrieval chain and rerank its sources.

    Relies on notebook-level globals created in other cells: ``chain``,
    ``vectorstore``, ``keyword_model`` (online mode only) and
    ``DYNAMIC_DATA_DIR``.

    Args:
        question: The question text passed to the chain and to the reranker.
        online: When true, a search query is generated from the question,
            documents are downloaded into ``DYNAMIC_DATA_DIR`` and added to
            ``vectorstore`` before the chain runs. The download limits come
            from the ``num_docs`` and ``max_tries`` environment variables.
        top_k: Forwarded to ``rerank_documents`` as ``top_k``.
        min_score: Forwarded to ``rerank_documents`` as ``min_score``.

    Returns:
        A dict with the question, the extracted answer, the chain's source
        documents, the reranked/filtered documents, the generated query
        (empty string when offline), the downloaded files (empty list when
        offline) and the ``online`` / rerank settings used.

    Raises:
        ValueError: in online mode, if ``num_docs`` or ``max_tries`` is
            missing from the environment or is not an integer.
    """
    query = ""
    downloaded_files = []
    if online:
        print("Obtaining data from scholar...")
        # Validate the configuration first so a missing variable fails with a
        # clear message before any keyword-model or network work is done.
        num_docs: int = _required_int_env("num_docs")
        max_tries: int = _required_int_env("max_tries")
        query = generate_query_from_question(keyword_model, question)
        downloaded_files = download_documents(query, DYNAMIC_DATA_DIR, num_docs, max_tries)
        # NOTE(review): every online call extends the shared vectorstore, so
        # later calls (including offline ones) can see these documents too.
        add_embeddings_from_files(vectorstore, downloaded_files)

    response = chain.invoke({
        "question": question,
        "chat_history": []
    })
    # Keep only the text after the last "### Answer:" marker in the output.
    answer = response["answer"].split("### Answer:")[-1].strip()
    source_documents = response["source_documents"]
    filtered_docs = rerank_documents(question, source_documents, top_k=top_k, min_score=min_score)
    return {
        "question": question,
        "answer": answer,
        "source_documents": source_documents,
        "filtered_docs": filtered_docs,
        "query": query,
        "downloaded_files": downloaded_files,
        "online": online,
        "rerank_top_k": top_k,
        "min_rerank_score": min_score
    }


def rerank_documents(query: str, documents: list, top_k: int = 3, min_score: float = 0.5, model=None) -> list:
    """Score ``documents`` against ``query`` and keep the best-scoring ones.

    Args:
        query: Text paired with each document for scoring.
        documents: Objects exposing a ``page_content`` attribute.
        top_k: Number of highest-scoring documents considered.
        min_score: A considered document is kept only if its score is
            strictly greater than this value.
        model: Object with a ``predict(pairs)`` method returning one score per
            ``(query, text)`` pair. Defaults to the notebook-level
            ``reranker``, so existing callers are unaffected; passing it
            explicitly avoids depending on which cells have been run.

    Returns:
        At most ``top_k`` documents, ordered by descending score. An empty
        list when ``documents`` is empty.
    """
    if not documents:
        return []

    scorer = reranker if model is None else model
    pairs = [(query, doc.page_content) for doc in documents]
    # NOTE(review): scores are compared to min_score exactly as predict()
    # returns them; confirm the model's output scale matches the thresholds.
    scores = scorer.predict(pairs)
    ranked = sorted(zip(documents, scores), key=lambda pair: pair[1], reverse=True)

    # The threshold is applied after the top_k cut, so fewer than top_k
    # documents may be returned.
    return [doc for doc, score in ranked[:top_k] if score > min_score]

dynamic data dir: .\data\dynamic


In [None]:
import json

# One JSON object per line is appended to this file by the evaluation loop.
results_file = "results.jsonl"

# Retriever settings to evaluate, as (k, th) pairs passed to init_retriever.
configs = [
    {"k": k, "th": th}
    for k, th in (
        (3, 0.7),
        (3, 0.9),
        (5, 0.7),
        (5, 0.85),
        (10, 0.7),
        (10, 0.9),
    )
]

# Reranker settings to evaluate, as (top_k, min_score) pairs passed to ask_question.
rerank_configs = [
    {"top_k": top_k, "min_score": min_score}
    for top_k, min_score in ((2, 0.3), (3, 0.5), (5, 0.6))
]

# Questions asked for every combination of the settings above.
questions = [
    "What are the long-term effects of microplastic ingestion in marine life?",
    "How does transformer architecture differ from traditional RNNs in NLP tasks?",
    "What are the advantages of using vector databases for semantic search?",
    "How can FAISS be optimized for large-scale document retrieval?",
    "Tell me how retroviruses are able to cross species barriers and known cases of such situations",
    "What are the latest advancements in quantum computing and their potential applications?",
    "What are the risks and benefits of intermittent fasting in elderly patients with type 2 diabetes and mild cognitive impairment?",
    "What are the mechanisms by which chronic psychological stress increases the risk of cardiovascular disease?",
    "How does gut microbiota influence the efficacy of checkpoint inhibitor therapy in metastatic melanoma?",
    "What are the potential applications of CRISPR technology in treating genetic disorders?",
    "How did Alan Turing's 1936 paper on computable numbers influence the development of modern algorithms?",
    "What is the historical significance of the Treaty of Tordesillas in shaping modern colonial boundaries?",
    "How did the discovery of the double helix structure of DNA in 1953 by Watson and Crick revolutionize our understanding of genetics?",
    "What are the key differences between classical and quantum cryptography, and how do they impact data security?",
    "What are the implications of the discovery of the Higgs boson for our understanding of particle physics?",
    "How does the process of photosynthesis in plants contribute to the carbon cycle and climate regulation?",
    "What are the main limitations of quantum annealing for solving combinatorial optimization problems?",
    "How did the use of concrete evolve in Roman versus Byzantine architecture?",
    "Make comparison of different chips and graphic cards regarding their efficiency and scalability for AI applications on AWS",
    "What are the results for presidential elections in Poland in 2025?",
]

def _json_fallback(obj):
    """Convert values that ``json`` cannot encode natively.

    Retrieved documents (objects exposing ``page_content``) become a dict of
    their text and, when present, their ``metadata``; anything else falls
    back to ``str(obj)`` so a single odd value cannot abort the whole run.
    """
    if hasattr(obj, "page_content"):
        return {
            "page_content": obj.page_content,
            "metadata": getattr(obj, "metadata", {}),
        }
    return str(obj)


# Loaded once: neither call takes the retriever configuration, so reloading
# the models for every config only cost time.
reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")
keyword_model = initialise_keyword_model()

# Grid: retriever config x offline/online x question x rerank config.
# NOTE(review): ask_question(online=True) downloads and embeds documents on
# every call, so each question is fetched once per rerank config and the
# shared vectorstore keeps growing; offline passes of later retriever configs
# therefore run against documents fetched earlier. Confirm this is intended.
for config in configs:
    retriever = init_retriever(vectorstore, k=config["k"], th=config["th"])
    chain = init_chain(pipeline=pipeline, retriever=retriever)
    print("Start-up complete.")
    for online in [False, True]:
        for q in questions:
            for rerank_conf in rerank_configs:
                result = ask_question(
                    q,
                    online=online,
                    top_k=rerank_conf["top_k"],
                    min_score=rerank_conf["min_score"]
                )

                result.update({
                    "retriever_k": config["k"],
                    "retriever_th": config["th"]
                })

                # Serialise before opening the file so a failure cannot leave
                # a truncated line in the JSONL output. The fallback handles
                # the document objects in "source_documents"/"filtered_docs".
                line = json.dumps(result, default=_json_fallback)
                with open(results_file, "a", encoding="utf-8") as f:
                    f.write(line + "\n")

