In [1]:
from rag_objects.RAG_system_A import RAGSystemA
from rag_objects.RAG_system_B import RAGSystemB
from rag_objects.RAG_system_C import RAGSystemC
import pandas as pd
import time

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import torch
# Report whether PyTorch can see a CUDA device; as the last expression of the
# cell, the boolean result is shown as the cell output.
torch.cuda.is_available()
# Optional diagnostic (disabled): print the CUDA version reported by torch.version.
#print(torch.version.cuda)  
# NOTE(review): the quoted text below looks like a pasted model response and is
# unrelated to the CUDA check -- confirm whether it should be kept.
#"i'm sorry, but none of the provided locations are palaces in the hague.  the query is too specific for this dataset.

True

# Testing pipeline
## Test definitions

In [3]:

# Evaluation queries. Each entry pairs a natural-language 'user_query' with a
# 'golden_route': the list of location titles that evaluate_rag_system compares
# against the retrieved titles. The comparison is an exact string match (set
# intersection), so golden titles must be spelled exactly like the titles the
# RAG system returns.
test_cases = [
    {"user_query": "skyscrapers in the Hague", "golden_route": ["Hoftoren", "Het Strijkijzer", "De Kroon (woontoren)","Castalia (gebouw)","Zurichtoren"]},
    {"user_query": "What should a museum nerd see in Haarlem", "golden_route": ["Frans Hals Museum", "Verwey Museum Haarlem", "Teylers Museum","Het Dolhuys", "Archeologisch Museum Haarlem"]},
    {"user_query": "I like metro systems. Which stations are interesting in Amsterdam?", "golden_route": ["Nieuwmarkt (metrostation)", "Rokin (metrostation)", "Vijzelgracht (metrostation)","Weesperplein (metrostation)", "Noorderpark (metrostation)"]},
    {"user_query": "brutalist architecture Amsterdam", "golden_route": ["Hoofdgebouw Vrije Universiteit", "Leeuwenburg (Amsterdam)", "Louwesweg","Kraanspoor", "Weteringschans 26-28"]},
    {"user_query": "I want to see the most famous bridges of Amsterdam", "golden_route": ["Oudekerksbrug", "Blauwbrug", "Aluminiumbrug","Torensluis", "Sint Antoniessluishoogwaterkering"]},
    {"user_query": "the palaces of the Hague", "golden_route": ["Paleis Kneuterdijk", "Paleis Noordeinde", "Mauritshuis","Paleis Huis ten Bosch", "Vredespaleis"]},
    {"user_query": "What to see in Amsterdam to learn about the jewish heritage", "golden_route": ["Jodenbuurt (Amsterdam)", "Anne Frank Huis", "Nationaal Holocaustmuseum","Holocaust Namenmonument", "Portugees-Israëlietische Synagoge"]},
    {"user_query": "What should Rembrandt lover see in Leiden?", "golden_route": ["Latijnse school (Leiden)", "Rembrandtbrug", "Pieterskerk (Leiden)","Langebrug (Leiden)", "Museum De Lakenhal"]}
]

# test_cases = [
#     {"user_query": "skyscrapers in the Hague", "golden_route": ["Hoftoren", "Het Strijkijzer", "De Kroon (woontoren)","Castalia (gebouw)","Zurichtoren"]},
#     {"user_query": "What should a museum nerd see in Haarlem", "golden_route": ["Frans Hals Museum", "Verwey Museum Haarlem", "Teylers Museum","Het Dolhuys", "Archeologisch Museum Haarlem"]}
# ]

# Maps each embedding model name to the path of its prebuilt index file. The
# run cells iterate over this dict and pass the key as `embedding_model_name`
# and the value as `index_path` to the RAG system constructors.
# (File names suggest these are FAISS indexes -- confirm against the RAG classes.)
embedding_models = {
    "sentence-transformers/all-MiniLM-L6-v2": "embeddings/all-MiniLM-L6-v2_faiss_index.index",
    "sentence-transformers/all-mpnet-base-v2": "embeddings/all-mpnet-base-v2_faiss_index.index",
    "NovaSearch/stella_en_1.5B_v5" : "embeddings/stella_en_1_5B_v5_embeddings_faiss_index.index"
}

# Path handed to the RAG systems as `gemini_token_file`; presumably a text file
# holding the Gemini API token -- confirm against the RAG classes.
gemini_token_file = "API_tokens/gemini.txt"

def evaluate_rag_system(rag_system, test_cases, rag_system_name, embedding_model_name, top_k=8):
    """
    Run every test case through a RAG system and tabulate golden-route retrieval metrics.

    Args:
        rag_system: Object exposing ``query(user_query=..., top_k=...)`` that returns a
            dict; the titles found under its optional 'locations' list are scored.
        test_cases (list of dict): Each dict provides:
            - 'user_query': The query string sent to the system.
            - 'golden_route': List of expected location titles.
        rag_system_name (str): Label written to the 'rag_system' column.
        embedding_model_name (str): Label written to the 'embedding_model' column.
        top_k (int): Forwarded unchanged to ``rag_system.query``.

    Returns:
        pd.DataFrame: One row per test case holding the two labels, the query, the
        retrieved and golden titles, precision, recall and the hit count, plus every
        key of the query result whose name does not clash with those columns.
    """
    def _score_case(case):
        """Query the system for a single test case and build its result row."""
        query_text = case["user_query"]
        expected_titles = case["golden_route"]

        response = rag_system.query(user_query=query_text, top_k=top_k)
        found_titles = [place["title"] for place in response.get("locations", [])]

        found_unique = set(found_titles)
        expected_unique = set(expected_titles)
        hit_count = len(found_unique & expected_unique)

        # Precision divides by the raw retrieved list (duplicates included);
        # recall divides by the number of distinct golden titles.
        if found_titles:
            precision = hit_count / len(found_titles)
        else:
            precision = 0
        if expected_unique:
            recall = hit_count / len(expected_unique)
        else:
            recall = 0

        row = {
            "rag_system": rag_system_name,
            "embedding_model": embedding_model_name,
            "query": query_text,
            "retrieved_titles": found_titles,
            "golden_route": expected_titles,
            "precision": precision,
            "recall": recall,
            "num_retrieved_from_golden": hit_count,
        }

        # Carry over whatever else the system returned, never overwriting the
        # columns computed above.
        for extra_key, extra_value in response.items():
            row.setdefault(extra_key, extra_value)

        return row

    return pd.DataFrame([_score_case(case) for case in test_cases])

## Running tests
### RAG A

In [6]:
# Evaluate RAGSystemA once per embedding model and collect one DataFrame per model.
all_results = []

for model_name, index_path in embedding_models.items():
    print(f"Evaluating with embedding model: {model_name}")
    
    # A fresh system is built for every (model, index) pair.
    rag_system = RAGSystemA(index_path=index_path, embedding_model_name=model_name, gemini_token_file=gemini_token_file)
    
    # Run evaluation and collect results
    df_results = evaluate_rag_system(rag_system, test_cases, "RAGSystemA", model_name, top_k=8)
    all_results.append(df_results)

# Combine all results into a single DataFrame
final_results_df = pd.concat(all_results, ignore_index=True)
# Persist the combined table; the write targets the existing 'test_results' directory path as given.
final_results_df.to_csv("test_results/A_rag_results.csv", index=False)

print("Evaluation completed. Results saved to 'test_results/A_rag_results.csv'")

Evaluating with embedding model: sentence-transformers/all-MiniLM-L6-v2
RAGSystemA initialized and ready.
Evaluating with embedding model: sentence-transformers/all-mpnet-base-v2
RAGSystemA initialized and ready.
Evaluating with embedding model: NovaSearch/stella_en_1.5B_v5
RAGSystemA initialized and ready.
Evaluation completed. Results saved to 'test_results/A_rag_results.csv'


### RAG B

In [4]:
# Evaluate RAGSystemB once per embedding model and collect one DataFrame per model.
all_results = []

for model_name, index_path in embedding_models.items():
    print(f"Evaluating with embedding model: {model_name}")
    
    # A fresh system is built for every (model, index) pair.
    rag_system = RAGSystemB(index_path=index_path, embedding_model_name=model_name, gemini_token_file=gemini_token_file)
    
    # Run evaluation and collect results
    # NOTE(review): top_k=16 here, whereas the RAG A and RAG C cells use top_k=8 -- confirm this is intentional.
    df_results = evaluate_rag_system(rag_system, test_cases, "RAGSystemB", model_name,top_k=16)
    all_results.append(df_results)

# Combine all results into a single DataFrame
final_results_df = pd.concat(all_results, ignore_index=True)
# Persist the combined table; this file is the one loaded later by the rating cell (csv_path).
final_results_df.to_csv("test_results/B_rag_results.csv", index=False)

print("Evaluation completed. Results saved to 'test_results/B_rag_results.csv'")

Evaluating with embedding model: sentence-transformers/all-MiniLM-L6-v2
RAGSystemB initialized and ready.
Evaluating with embedding model: sentence-transformers/all-mpnet-base-v2
RAGSystemB initialized and ready.
Evaluating with embedding model: NovaSearch/stella_en_1.5B_v5
RAGSystemB initialized and ready.
Evaluation completed. Results saved to 'test_results/B_rag_results.csv'


### RAG C

In [5]:
# Evaluate RAGSystemC once per embedding model and collect one DataFrame per model.
all_results = []

for model_name, index_path in embedding_models.items():
    print(f"Evaluating with embedding model: {model_name}")
    
    # A fresh system is built for every (model, index) pair.
    rag_system = RAGSystemC(index_path=index_path, embedding_model_name=model_name, gemini_token_file=gemini_token_file)
    
    # Run evaluation and collect results
    df_results = evaluate_rag_system(rag_system, test_cases, "RAGSystemC", model_name,top_k=8)
    all_results.append(df_results)

# Combine all results into a single DataFrame
final_results_df = pd.concat(all_results, ignore_index=True)
# Persist the combined table for RAG system C.
final_results_df.to_csv("test_results/C_rag_results.csv", index=False)

print("Evaluation completed. Results saved to 'test_results/C_rag_results.csv'")

Evaluating with embedding model: sentence-transformers/all-MiniLM-L6-v2
RAGSystemC initialized and ready.
Evaluating with embedding model: sentence-transformers/all-mpnet-base-v2
RAGSystemC initialized and ready.
Evaluating with embedding model: NovaSearch/stella_en_1.5B_v5
RAGSystemC initialized and ready.
Evaluation completed. Results saved to 'test_results/C_rag_results.csv'


## Marking answers

In [3]:
import pandas as pd
import folium
import ipywidgets as widgets
from IPython.display import display, clear_output
import numpy as np

In [18]:
# Read the saved evaluation results that are about to be rated by hand
csv_path = "test_results/B_rag_results.csv"  # Change this if needed
df = pd.read_csv(csv_path)

# One list-valued column per rating category. Every row receives its own
# empty list (never a shared object) to be filled in during manual rating.
for rating_column in (
    "irrelevant_wrong_area",
    "relevant_wrong_area",
    "irrelevant",
    "relevant",
    "good",
):
    df[rating_column] = [[] for _ in range(len(df))]

# Matching per-category counters, all starting at zero
for count_column in (
    "count_irrelevant_wrong_area",
    "count_relevant_wrong_area",
    "count_irrelevant",
    "count_relevant",
    "count_good",
):
    df[count_column] = 0

def display_map(latitude, longitude, title):
    """ Show a Folium map (height 400, zoom 12) centred on the coordinates, with one marker labelled by title. """
    centred_map = folium.Map(
        location=[latitude, longitude],
        zoom_start=12,
        control_scale=True,
        height=400,
    )
    # The title is used for both the click popup and the hover tooltip.
    pin = folium.Marker([latitude, longitude], popup=title, tooltip=title)
    pin.add_to(centred_map)
    display(centred_map)

def rate_location(row_idx, locations):
    """
    Interactively collect a 1-5 rating for every retrieved location of one result row.

    Args:
        row_idx: Row label in the module-level ``df``; its 'query' value is printed as context.
        locations: Iterable of dicts providing 'title', 'generated_text', 'latitude' and 'longitude'.

    Returns:
        dict: Rating code (1-5) mapped to the list of titles that received it, in rating order.
    """
    scale = {
        1: "Irrelevant, Wrong Area",
        2: "Relevant, Wrong Area",
        3: "Irrelevant",
        4: "Relevant",
        5: "Good"
    }
    titles_by_rating = {code: [] for code in scale}

    def _ask_rating(place_title):
        """Keep prompting until the user types an integer that is a key of ``scale``."""
        while True:
            try:
                answer = int(input(f"Enter rating (1-5) for '{place_title}': ").strip())
            except ValueError:
                print("Invalid input. Please enter a valid number between 1 and 5.")
                continue
            if answer in scale:
                return answer
            print("Invalid input. Please enter a number between 1 and 5.")

    for place in locations:
        # Wipe the previous location's output only once new output is ready to replace it.
        clear_output(wait=True)
        title = place["title"]
        description = place["generated_text"]
        latitude = place["latitude"]
        longitude = place["longitude"]

        print(f"User Query: {df.loc[row_idx, 'query']}")
        print(f"\nLocation: {title}")
        # Only the first 1500 characters of the description are shown.
        print(f"Description: {description[:1500]}...\n")

        # The rating legend is printed before the map is rendered.
        print("\nRating Guide:")
        for code, label in scale.items():
            print(f"{code}: {label}")

        display_map(latitude, longitude, title)

        titles_by_rating[_ask_rating(title)].append(title)

    return titles_by_rating

def manual_rating_pipeline(output_path="test_results/B_rated_rag_results.csv"):
    """
    Walk through every row of the module-level ``df`` and rate its retrieved locations by hand.

    For each row the stored 'locations' string is turned back into a list, every
    location is rated via ``rate_location``, and the per-category title lists and
    counts are written into ``df``. The frame is saved after every row, so an
    interrupted session keeps the ratings entered so far, and once more at the end.

    Args:
        output_path (str): CSV file the rated frame is written to. Defaults to the
            RAG B results file; pass another path when rating a different results
            CSV instead of editing the function body.

    Returns:
        pd.DataFrame: The module-level ``df`` with rating and count columns filled in.
    """
    # Rating code -> (list column in df, label used in the printed summary).
    categories = {
        1: ("irrelevant_wrong_area", "Irrelevant, Wrong Area"),
        2: ("relevant_wrong_area", "Relevant, Wrong Area"),
        3: ("irrelevant", "Irrelevant"),
        4: ("relevant", "Relevant"),
        5: ("good", "Good"),
    }

    for row_idx in range(len(df)):
        # SECURITY NOTE(review): eval() executes whatever expression the CSV cell
        # contains, so only run this on result files you produced yourself.
        # It is kept (rather than ast.literal_eval) because the stored repr may
        # reference names such as `np` that literal_eval rejects -- confirm and
        # switch to a safe parser (or store JSON) if that is not the case.
        locations = eval(df.loc[row_idx, "locations"])  # Convert string to list
        ratings = rate_location(row_idx, locations)

        for code, (column, _label) in categories.items():
            df.at[row_idx, column] = ratings[code]
            # Store only the counts per category
            df.at[row_idx, "count_" + column] = len(ratings[code])

        clear_output()
        print("Summary of Ratings for Current Query:")
        print(f"User Query: {df.at[row_idx, 'query']}")

        for _code, (column, label) in categories.items():
            print(f"{label}: {df.at[row_idx, 'count_' + column]}")

        # Checkpoint after every query so finished ratings survive an interruption.
        df.to_csv(output_path, index=False)
        input("Press Enter to proceed to the next query...")

    df.to_csv(output_path, index=False)
    print(f"All ratings saved to '{output_path}'.")
    return df

In [19]:
# Start the interactive rating session. The pipeline writes the rated table to
# CSV itself and also returns the rated DataFrame, kept here as `rated_df`.
rated_df = manual_rating_pipeline()

Summary of Ratings for Current Query:
User Query: What should Rembrandt lover see in Leiden?
Irrelevant, Wrong Area: 0
Relevant, Wrong Area: 0
Irrelevant: 0
Relevant: 0
Good: 0
All ratings saved to 'test_results/B_rated_rag_results.csv'.
