In [1]:
import ollama
import json
import lancedb
from lancedb.rerankers import ColbertReranker
from tqdm.notebook import tqdm
from pydantic import BaseModel, Field
from typing import Literal
from devtools import debug
from utils.metrics_calculate import calculate_advanced_retrieval_metrics
import instructor

# --- Notebook configuration -------------------------------------------------
# Title string for the metrics figure.
# NOTE(review): not referenced by any cell visible in this notebook — confirm it is still needed.
METRICS_FIGURE_NAME = "Contextual retrieval performance metrics (document slice only)"


# LanceDB table names: the table under evaluation and the baseline it is compared against.
OWN_TABLE_NAME = "my_anthropic_sliding_table"
BASELINE_TABLE_NAME = "anthropic_table"
# JSON file with the evaluation questions; each entry is read for its
# "question" and "supporting_chunks" keys by the recall cell below.
QNA_FILE_PATH = "q_and_a/Gemini/scientific_multi_chunk.json"
# Model tag used for the LLM-as-a-judge grader.
MODEL_TAG = "qwen3-vl:8b-instruct-q4_K_M"

# Structured-output client used by grade_chunk_relevance().
# Points at a server on localhost:11434 — presumably a locally running Ollama
# instance; it must be up before any grading call is made.
client = instructor.from_provider(
	f"ollama/{MODEL_TAG}",
	base_url="http://localhost:11434/v1",
	mode=instructor.Mode.JSON,
)

# Structured verdict requested from the judge model; passed as `response_model`
# in grade_chunk_relevance().
# NOTE(review): field names, order and descriptions presumably end up in the schema
# shown to the model, so treat any edit to them as a prompt change — confirm.
class RelevanceEvaluation(BaseModel):
	# Declared before `score`, so the reasoning field comes first in the model's field order.
	chain_of_thought: str = Field(
		..., 
		description="A brief reasoning step explaining why the score was given."
	)
	# Restricted to the four rubric levels (0-3); any other value fails validation.
	score: Literal[0,1,2,3] = Field(
		..., 
		description="The relevance score (0, 1, 2, or 3) based on the grading rubric."
	)

def grade_chunk_relevance(question: str, chunk_text: str, model_name: str) -> RelevanceEvaluation:
	"""
	Ask the judge model to grade a single retrieved chunk against a question.

	Args:
		question: The question the chunk was retrieved for.
		chunk_text: Text of the retrieved chunk (the PASSAGE in the prompt).
		model_name: Model identifier forwarded to the instructor client.

	Returns:
		A RelevanceEvaluation carrying the rubric score (0-3) and the model's
		reasoning. If the call fails for any reason, a fallback evaluation with
		score 0 is returned whose `chain_of_thought` records the error, so
		callers can always read `.score` and `.chain_of_thought`.
	"""
	
	# Precise rubric for the system prompt
	system_prompt = """
	You are an impartial expert judge evaluating retrieval quality for a RAG system.
	Evaluate the relevance of the PASSAGE to the QUESTION using this strict scale:
	
	0: Irrelevant. The passage is on a different topic or does not help.
	1: Tangential. Mentions related entities but does not elaborate or provide an explicit answer to the question.
	2: Relevant/Partial. Provides useful context or a partial answer.
	3: Highly Relevant. Contains the direct answer or core evidence required.
	"""

	try:
		return client.create(
			model=model_name,
			response_model=RelevanceEvaluation,
			messages=[
				{"role": "system", "content": system_prompt},
				{"role": "user", "content": f"QUESTION: {question}\nPASSAGE: {chunk_text}"}
			],
			temperature=0  # keep grading as repeatable as the backend allows
		)
	except Exception as e:
		print(f"Error grading chunk: {e}")
		# Fail-safe: assume irrelevant if the model call fails, but keep the declared
		# return type so `.score` / `.chain_of_thought` never raise AttributeError.
		return RelevanceEvaluation(
			chain_of_thought=f"Grading failed: {e}",
			score=0,
		)

             1_minus_recall_at_4  context_precision_ratio  mrr_at_4  ndcg_at_4
question_id                                                                   
1                       0.333333                      1.0       1.0   0.703918
2                       1.000000                      0.0       0.0   0.000000


In [2]:
# Open the local LanceDB database and the two tables being compared.
db = lancedb.connect("./db")
own_table = db.open_table(OWN_TABLE_NAME)
baseline_table = db.open_table(BASELINE_TABLE_NAME)
# One reranker instance is shared by every query in the evaluation loop.
reranker = ColbertReranker()

# Decode explicitly as UTF-8: without `encoding`, open() falls back to the
# platform locale, which can corrupt non-ASCII text in the questions.
with open(QNA_FILE_PATH, "r", encoding="utf-8") as f:
	qna = json.load(f)

Loading ColBERTRanker model colbert-ir/colbertv2.0 (this message can be suppressed by setting verbose=0)
No device set
Using device cuda
No dtype set
Using dtype torch.float32
Loading model colbert-ir/colbertv2.0, this might take a while...
Linear Dim set to: 128 for downcasting


In [3]:
qna[0]['question']

'How do the fundamental goals of open science conflict with the specific characteristics of empirical software engineering research involving industrial contexts?'

In [4]:
# Per-question recall dictionaries ({k: recall@k}), one entry per evaluated question.
own_recalls_per_question = []
baseline_recalls_per_question = []

# Filled by the averaging cell further down.
own_avg_recall_at_n = {}
baseline_avg_recall_at_n = {}

# Cutoffs k at which recall@k is measured: 5, 10, 15, 20.
RECALL_CUTOFFS = tuple(range(5, 21, 5))
# Retrieve once at the largest cutoff and slice for the smaller ones.
MAX_RETRIEVED = max(RECALL_CUTOFFS)


def _retrieve_ranked_ids(table, query: str, limit: int) -> list:
	"""Run the hybrid search + rerank on `table` and return the ranked chunk ids."""
	results_df = table.search(query, query_type="hybrid", vector_column_name="vector", fts_columns="text") \
			.rerank(reranker=reranker) \
			.limit(limit) \
			.to_pandas()
	return results_df['id'].tolist()


def _recall_at_k(ranked_ids: list, relevant_ids: set, k: int) -> float:
	"""Fraction of `relevant_ids` present among the first `k` entries of `ranked_ids`."""
	# Set intersection counts each relevant chunk at most once, so the result
	# stays within [0, 1] even if the retriever returns an id twice.
	return len(relevant_ids.intersection(ranked_ids[:k])) / len(relevant_ids)


for question in tqdm(qna, desc="Processing questions..."):
	question_prompt = question["question"]
	supporting_chunk_ids = set(question["supporting_chunks"])

	if not supporting_chunk_ids:
		# Recall is undefined without ground-truth chunks; skip instead of dividing by zero.
		print(f"Skipping question without supporting chunks: {question_prompt!r}")
		continue

	own_ranked_ids = _retrieve_ranked_ids(own_table, question_prompt, MAX_RETRIEVED)
	baseline_ranked_ids = _retrieve_ranked_ids(baseline_table, question_prompt, MAX_RETRIEVED)

	own_recalls_per_question.append(
		{k: _recall_at_k(own_ranked_ids, supporting_chunk_ids, k) for k in RECALL_CUTOFFS}
	)
	baseline_recalls_per_question.append(
		{k: _recall_at_k(baseline_ranked_ids, supporting_chunk_ids, k) for k in RECALL_CUTOFFS}
	)

Processing questions...:   0%|          | 0/270 [00:00<?, ?it/s]

<All keys matched successfully>
<All keys matched successfully>


In [6]:
import numpy as np

# Average the per-question recalls at every cutoff (5, 10, 15, 20) and report
# both systems. Each report is labelled with its cutoff so the printed pairs
# can be told apart.
for i in range(5,21,5):
    own_avg_recall_at_n[i] = np.average([recall[i] for recall in own_recalls_per_question])
    baseline_avg_recall_at_n[i] = np.average([recall[i] for recall in baseline_recalls_per_question])

    print(f"Recall@{i}\nOwn results {own_avg_recall_at_n[i]}\nBaseline: {baseline_avg_recall_at_n[i]}")



Own results 0.7743827160493827
Baseline: 0.43364197530864196
Own results 0.8975308641975309
Baseline: 0.4981481481481482
Own results 0.9324074074074075
Baseline: 0.5160493827160494
Own results 0.9561728395061727
Baseline: 0.5225308641975309


In [None]:
import matplotlib.pyplot as plt
from pydantic import BaseModel, ValidationError, Field, model_validator 
from typing import List, Self

class BarChartData(BaseModel):
    """
    Schema for validating bar chart data.
    Ensures data consistency before visualization.

    Construction fails with a pydantic ValidationError when `labels` and
    `values` have different lengths.
    """
    labels: List[str] = Field(..., description="Names of the bars")
    values: List[float] = Field(..., description="Numerical values for the bars")
    title: str = Field(..., description="Title of the chart")

    @model_validator(mode="after")
    def check_length_match(self) -> Self:
        """Reject instances whose labels and values cannot be paired one-to-one."""
        if len(self.labels) != len(self.values):
            # Validators must raise ValueError (or AssertionError); pydantic wraps it
            # into a ValidationError for the caller. ValidationError itself cannot be
            # constructed from a plain message.
            raise ValueError(f"There is different number of labels and values!\n{self.labels=}\n{self.values=}")
        return self
    

def create_annotated_bar_chart(data: BarChartData, output_file: str = 'annotated_bar_chart.png'):
    """
    Generates a bar chart with explicit value annotations above each bar and
    saves it to `output_file`.

    Args:
        data: Validated labels, values and title for the chart.
        output_file: Destination path for the image. A missing parent
            directory is created before saving.

    The figure is always closed, even if plotting or saving fails.
    """
    import os  # local import keeps this cell self-contained

    # Create the figure and axis explicitly for better control
    fig, ax = plt.subplots()
    try:
        # Capture the container of bars to access their properties later
        bars = ax.bar(data.labels, data.values, color=['#1f77b4', '#ff7f0e'])

        ax.bar_label(bars, padding=3)

        ax.set_xlabel('Categories')
        ax.set_ylabel('Values')
        ax.set_title(data.title)

        # Dynamic Y-Axis Adjustment
        # Add 10% headroom so the annotations are not cut off at the top. Skipped for
        # empty data (max() would raise) and for a non-positive maximum, where a
        # (0, <=0) range would be degenerate or inverted; autoscaling is kept instead.
        if data.values:
            y_top = max(data.values) * 1.1
            if y_top > 0:
                ax.set_ylim(0, y_top)

        # savefig does not create directories; make sure the target folder exists.
        if isinstance(output_file, (str, os.PathLike)):
            os.makedirs(os.path.dirname(os.fspath(output_file)) or ".", exist_ok=True)

        fig.savefig(output_file, dpi=300)
    finally:
        # Close this specific figure to free memory in batch processing
        plt.close(fig)

In [26]:
# Recall@5 for both systems, rounded to 5 decimals so the bar annotations stay short.
recall_at_5_values = [
    round(baseline_avg_recall_at_n[5],5),
    round(own_avg_recall_at_n[5],5),
]
data = BarChartData(
    labels=["Anthropic Baseline", "Document slices"],
    values=recall_at_5_values,
    title="Recall@5",
)

create_annotated_bar_chart(data=data, output_file="benchmarking_results/retrieval/recall_5")

In [31]:
import math

def create_multi_bar_chart(data_list: List[BarChartData], output_file: str = 'multi_plot.png'):
    """
    Draws one annotated bar chart per BarChartData entry on a shared figure
    and saves the figure to `output_file`.

    The grid uses at most three columns and as many rows as needed; grid
    cells without data are hidden.

    Raises:
        ValueError: if `data_list` is empty.
    """
    panel_count = len(data_list)
    if panel_count == 0:
        raise ValueError("Input list is empty.")

    # Grid layout: up to 3 panels per row, 6x5 inches per panel.
    n_cols = min(panel_count, 3)
    n_rows = math.ceil(panel_count / n_cols)

    # squeeze=False always yields a 2D axes array, which flattens to a simple
    # row-major sequence of panels.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(6 * n_cols, 5 * n_rows), squeeze=False)
    panels = axes.flatten()

    bar_colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728']

    for panel, chart in zip(panels, data_list):
        bar_container = panel.bar(chart.labels, chart.values, color=bar_colors)

        # Value annotations above the bars (Axes.bar_label needs matplotlib >= 3.4.0)
        panel.bar_label(bar_container, padding=3, fmt='%.5f')

        panel.set_xlabel('Categories')
        panel.set_ylabel('Values')
        panel.set_title(chart.title)

        # 15% headroom so the annotations fit; skipped when there are no values
        if chart.values:
            panel.set_ylim(0, max(chart.values) * 1.15)

    # Hide the grid cells left over in the last row
    # (e.g. 4 charts on a 2x3 grid leave 2 empty cells).
    for spare_panel in panels[panel_count:]:
        spare_panel.axis('off')

    # Prevent titles and labels of neighbouring panels from overlapping
    plt.tight_layout()

    plt.savefig(output_file, dpi=300)
    plt.close()

# --- Execution Example ---
# One chart per recall cutoff (5, 10, 15, 20), baseline vs. own table.
try:
    dataset = [
        BarChartData(
            labels=["Anthropic Baseline", "Document slices"],
            values=[baseline_avg_recall_at_n[i], own_avg_recall_at_n[i]],
            title=f"Recall@{i}",
        )
        for i in range(5,21,5)
    ]

    create_multi_bar_chart(dataset, 'dashboard_view.png')
    print("Multi-plot generated successfully.")
except Exception as e:
    print(f"Error: {e}")

Multi-plot generated successfully.


In [30]:
# Sanity check: the baseline Recall@5 value, rounded the same way as in the Recall@5 bar chart.
round(baseline_avg_recall_at_n[5],5)

np.float64(0.43364)

# LLM as a judge (TODO)

In [None]:
# TODO: draft of the LLM-as-a-judge loop, currently disabled.
# Before re-enabling:
#   - iterate over `qna` (the list of questions); `qna[0]` is a single question dict,
#     so looping over it yields its keys;
#   - read the prompt from the loop variable (`question["question"]`) instead of `qna[0]`;
#   - replace `table` with `own_table` or `baseline_table`; no `table` variable is
#     defined in the cells above.
# for question in tqdm(qna[0], desc="Processing questions..."):
	# question_prompt = qna[0]["question"]

	# df = table.search(question_prompt, query_type="hybrid", vector_column_name="vector", fts_columns="text") \
	#             .rerank(reranker=reranker) \
	#             .limit(10) \
	#             .to_pandas()

	# for idx, row in tqdm(df.iterrows(), total=len(df), desc="Grading Chunks"):
	#     resp = grade_chunk_relevance(question_prompt, row['text'], MODEL_TAG)
	#     df.at[idx, 'llm_grade'] = resp.score
	#     df.at[idx, 'reasoning'] = resp.chain_of_thought

	# df[['text', 'llm_grade', 'reasoning']]
		

Grading Chunks:   0%|          | 0/10 [00:00<?, ?it/s]

                                                text  llm_grade  \
0  The text outlines a conceptual framework and r...        3.0   
1  The text outlines a project to develop methodo...        3.0   
2  This chunk outlines the core concepts of open ...        2.0   
3  This section outlines the authors’ work: a con...          3.0   
4  Provides a framework for balancing open scienc...        2.0   
5  This chunk details the categorization of empir...        2.0   
6  Identifies the Gander project as a case study ...        2.0   
7  The analysis focuses on a framework for unders...        2.0   
8  Provides supporting evidence for the argument ...        0.0   
9  This chunk highlights the contrasting approach...        2.0   

                                           reasoning  
0  The passage directly addresses the conflict be...  
1  The passage directly addresses the conflict be...  
2  The passage acknowledges the tension between o...  
3  The passage directly addresses the co