In [1]:
import time
from pathlib import Path

import pandas as pd

from rag_objects.RAG_system_A import RAGSystemA
from rag_objects.RAG_system_B import RAGSystemB
from rag_objects.RAG_system_C import RAGSystemC

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Check whether PyTorch reports CUDA as available; the boolean is shown as the cell output.
import torch
torch.cuda.is_available()
# Disabled debugging aid: uncomment to print torch.version.cuda.
#print(torch.version.cuda)  
# NOTE(review): the line below looks like a pasted model reply, presumably kept as an
# example of a failed answer to the "palaces of the Hague" query — confirm or remove.
#"i'm sorry, but none of the provided locations are palaces in the hague.  the query is too specific for this dataset.

True

# Testing pipeline
## Test definitions

In [3]:

# Evaluation queries. Each entry pairs a free-text user query with its
# "golden route": the five location titles retrieved titles are scored against.
# Tip: for a quick smoke run, pass a slice such as test_cases[:2] to the evaluator.
test_cases = [
    {
        "user_query": "skyscrapers in the Hague",
        "golden_route": [
            "Hoftoren",
            "Het Strijkijzer",
            "De Kroon (woontoren)",
            "Castalia (gebouw)",
            "Zurichtoren",
        ],
    },
    {
        "user_query": "What should a museum nerd see in Haarlem",
        "golden_route": [
            "Frans Hals Museum",
            "Verwey Museum Haarlem",
            "Teylers Museum",
            "Het Dolhuys",
            "Archeologisch Museum Haarlem",
        ],
    },
    {
        "user_query": "I like metro systems. Which stations are interesting in Amsterdam?",
        "golden_route": [
            "Nieuwmarkt (metrostation)",
            "Rokin (metrostation)",
            "Vijzelgracht (metrostation)",
            "Weesperplein (metrostation)",
            "Noorderpark (metrostation)",
        ],
    },
    {
        "user_query": "brutalist architecture Amsterdam",
        "golden_route": [
            "Hoofdgebouw Vrije Universiteit",
            "Leeuwenburg (Amsterdam)",
            "Louwesweg",
            "Kraanspoor",
            "Weteringschans 26-28",
        ],
    },
    {
        "user_query": "I want to see the most famous bridges of Amsterdam",
        "golden_route": [
            "Oudekerksbrug",
            "Blauwbrug",
            "Aluminiumbrug",
            "Torensluis",
            "Sint Antoniessluishoogwaterkering",
        ],
    },
    {
        "user_query": "the palaces of the Hague",
        "golden_route": [
            "Paleis Kneuterdijk",
            "Paleis Noordeinde",
            "Mauritshuis",
            "Paleis Huis ten Bosch",
            "Vredespaleis",
        ],
    },
    {
        "user_query": "What to see in Amsterdam to learn about the jewish heritage",
        "golden_route": [
            "Jodenbuurt (Amsterdam)",
            "Anne Frank Huis",
            "Nationaal Holocaustmuseum",
            "Holocaust Namenmonument",
            "Portugees-Israëlietische Synagoge",
        ],
    },
    {
        "user_query": "What should Rembrandt lover see in Leiden?",
        "golden_route": [
            "Latijnse school (Leiden)",
            "Rembrandtbrug",
            "Pieterskerk (Leiden)",
            "Langebrug (Leiden)",
            "Museum De Lakenhal",
        ],
    },
]

# Embedding model name -> path of the FAISS index file used with that model.
embedding_models = {
    "sentence-transformers/all-MiniLM-L6-v2": "embeddings/all-MiniLM-L6-v2_faiss_index.index",
    "sentence-transformers/all-mpnet-base-v2": "embeddings/all-mpnet-base-v2_faiss_index.index",
    "NovaSearch/stella_en_1.5B_v5": "embeddings/stella_en_1_5B_v5_embeddings_faiss_index.index",
}

# File path handed to every RAG system as its `gemini_token_file` argument.
gemini_token_file = "API_tokens/gemini.txt"

def evaluate_rag_system(rag_system, test_cases, rag_system_name, embedding_model_name, top_k=8):
    """
    Evaluate a RAG system by running multiple test cases and computing retrieval metrics.

    Precision and recall are both computed over *unique* titles: a title that is
    retrieved more than once counts once in the numerator and once in the
    denominator, so duplicates cannot skew the precision.

    Args:
        rag_system: Object exposing ``query(user_query=..., top_k=...)`` that returns
            a dict. Its optional 'locations' entry must be a list of dicts that each
            carry a 'title' key.
        test_cases (list of dict): Each dict should contain:
            - 'user_query': The query string.
            - 'golden_route': List of ideal location titles.
        rag_system_name (str): Name of the RAG system being tested.
        embedding_model_name (str): Name of the embedding model used.
        top_k (int): Forwarded unchanged to ``rag_system.query``. Defaults to 8.

    Returns:
        pd.DataFrame: One row per test case with the columns 'rag_system',
        'embedding_model', 'query', 'retrieved_titles', 'golden_route',
        'precision', 'recall' and 'num_retrieved_from_golden', plus every other
        key of the query result that does not clash with those names.

    Raises:
        KeyError: If a test case lacks 'user_query'/'golden_route' or a returned
            location lacks 'title'.
    """
    results = []

    for test_case in test_cases:
        user_query = test_case["user_query"]
        golden_route = test_case["golden_route"]

        # Run query
        result = rag_system.query(user_query=user_query, top_k=top_k)

        # Extract retrieved titles (kept as returned, duplicates included, for reporting)
        retrieved_titles = [loc["title"] for loc in result.get("locations", [])]

        # Compute golden route metrics on de-duplicated titles so numerator and
        # denominator are measured on the same basis.
        retrieved_set = set(retrieved_titles)
        golden_set = set(golden_route)
        common = retrieved_set & golden_set
        precision = len(common) / len(retrieved_set) if retrieved_set else 0
        recall = len(common) / len(golden_set) if golden_set else 0

        result_entry = {
            "rag_system": rag_system_name,
            "embedding_model": embedding_model_name,
            "query": user_query,
            "retrieved_titles": retrieved_titles,
            "golden_route": golden_route,
            "precision": precision,
            "recall": recall,
            "num_retrieved_from_golden": len(common),
        }

        # Carry over every extra field of the query result; the metric columns
        # above take priority on a name clash.
        for key, value in result.items():
            result_entry.setdefault(key, value)

        results.append(result_entry)

    return pd.DataFrame(results)

## Running tests
### RAG A

In [6]:
# Evaluate RAGSystemA once per embedding model and save the combined results as CSV.
results_csv = "test_results/A_rag_results.csv"
# Create the output folder before the evaluation starts: to_csv does not create
# missing directories, and failing at the very end would discard the whole run.
Path(results_csv).parent.mkdir(parents=True, exist_ok=True)

all_results = []

for model_name, index_path in embedding_models.items():
    print(f"Evaluating with embedding model: {model_name}")

    rag_system = RAGSystemA(index_path=index_path, embedding_model_name=model_name, gemini_token_file=gemini_token_file)

    # Run evaluation and collect the per-model results frame
    df_results = evaluate_rag_system(rag_system, test_cases, "RAGSystemA", model_name, top_k=8)
    all_results.append(df_results)

# Combine all per-model frames into a single DataFrame
final_results_df = pd.concat(all_results, ignore_index=True)
final_results_df.to_csv(results_csv, index=False)

print(f"Evaluation completed. Results saved to '{results_csv}'")

Evaluating with embedding model: sentence-transformers/all-MiniLM-L6-v2
RAGSystemA initialized and ready.
Evaluating with embedding model: sentence-transformers/all-mpnet-base-v2
RAGSystemA initialized and ready.
Evaluating with embedding model: NovaSearch/stella_en_1.5B_v5
RAGSystemA initialized and ready.
Evaluation completed. Results saved to 'test_results/A_rag_results.csv'


### RAG B

In [4]:
# Evaluate RAGSystemB once per embedding model and save the combined results as CSV.
results_csv = "test_results/B_rag_results.csv"
# Create the output folder before the evaluation starts: to_csv does not create
# missing directories, and failing at the very end would discard the whole run.
Path(results_csv).parent.mkdir(parents=True, exist_ok=True)

all_results = []

for model_name, index_path in embedding_models.items():
    print(f"Evaluating with embedding model: {model_name}")

    rag_system = RAGSystemB(index_path=index_path, embedding_model_name=model_name, gemini_token_file=gemini_token_file)

    # Run evaluation and collect the per-model results frame
    # (this system is queried with top_k=16, unlike the other two runs)
    df_results = evaluate_rag_system(rag_system, test_cases, "RAGSystemB", model_name, top_k=16)
    all_results.append(df_results)

# Combine all per-model frames into a single DataFrame
final_results_df = pd.concat(all_results, ignore_index=True)
final_results_df.to_csv(results_csv, index=False)

print(f"Evaluation completed. Results saved to '{results_csv}'")

Evaluating with embedding model: sentence-transformers/all-MiniLM-L6-v2
RAGSystemB initialized and ready.
Evaluating with embedding model: sentence-transformers/all-mpnet-base-v2
RAGSystemB initialized and ready.
Evaluating with embedding model: NovaSearch/stella_en_1.5B_v5
RAGSystemB initialized and ready.
Evaluation completed. Results saved to 'test_results/B_rag_results.csv'


### RAG C

In [5]:
# Evaluate RAGSystemC once per embedding model and save the combined results as CSV.
results_csv = "test_results/C_rag_results.csv"
# Create the output folder before the evaluation starts: to_csv does not create
# missing directories, and failing at the very end would discard the whole run.
Path(results_csv).parent.mkdir(parents=True, exist_ok=True)

all_results = []

for model_name, index_path in embedding_models.items():
    print(f"Evaluating with embedding model: {model_name}")

    rag_system = RAGSystemC(index_path=index_path, embedding_model_name=model_name, gemini_token_file=gemini_token_file)

    # Run evaluation and collect the per-model results frame
    df_results = evaluate_rag_system(rag_system, test_cases, "RAGSystemC", model_name, top_k=8)
    all_results.append(df_results)

# Combine all per-model frames into a single DataFrame
final_results_df = pd.concat(all_results, ignore_index=True)
final_results_df.to_csv(results_csv, index=False)

print(f"Evaluation completed. Results saved to '{results_csv}'")

Evaluating with embedding model: sentence-transformers/all-MiniLM-L6-v2
RAGSystemC initialized and ready.
Evaluating with embedding model: sentence-transformers/all-mpnet-base-v2
RAGSystemC initialized and ready.
Evaluating with embedding model: NovaSearch/stella_en_1.5B_v5
RAGSystemC initialized and ready.
Evaluation completed. Results saved to 'test_results/C_rag_results.csv'
