In [1]:
# ==========================================
# CELL 1: SETUP & KEY MANAGEMENT
# ==========================================
# !pip install groq faiss-cpu sentence-transformers

import os
import getpass
from groq import Groq
import json
import gc
import pickle
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
from dotenv import load_dotenv
from rag_evaluation import RAGEvaluator
import pandas as pd

load_dotenv(override=True)

# 1. Securely Input API Key
# If you haven't set it in your environment, this will prompt you.
if "GROQ_API_KEY" not in os.environ:
    print("üîë Enter your Groq API Key (Input will be hidden):")
    os.environ["GROQ_API_KEY"] = getpass.getpass()
# 2. Initialize Groq Client
client = Groq(
    api_key=os.environ.get("GROQ_API_KEY"),
)
# print(os.environ["GROQ_API_KEY"])

# 3. Define Model
# Llama 3.3 70B is the current SOTA on Groq for reasoning
LLM_MODEL = "llama-3.3-70b-versatile"
# LLM_MODEL = "openai/gpt-oss-120b"

print(f"‚úÖ Groq Client Configured with model: {LLM_MODEL}")

‚úÖ Groq Client Configured with model: llama-3.3-70b-versatile


In [2]:
# ==========================================
# CELL 2: THE RETRIEVER CLASS (The "Memory")
# ==========================================
class Retriever:
    def __init__(self, model_path="./models"):
        print("‚öôÔ∏è Loading Retrieval Engine...")
        
        # 1. Load Embedding Model (Must match Phase 2!)
        # We use the same BAAI/bge-small model to ensure vectors match
        self.encoder = SentenceTransformer("BAAI/bge-small-en-v1.5")
        
        # 2. Load FAISS Index
        self.index = faiss.read_index(f"{model_path}/faiss_index.bin")
        
        # 3. Load Metadata (The actual text)
        with open(f"{model_path}/chunk_metadata.pkl", "rb") as f:
            self.chunks = pickle.load(f)
            
        print(f"‚úÖ System Ready. Index contains {self.index.ntotal} documents.")

    def search(self, query, k=5):
        """
        Takes a user query, finds the top K most relevant chunks.
        """
        # A. Embed Query
        # Remember: bge-models need the instruction for queries
        query_prompt = f"Represent this sentence for searching relevant passages: {query}"
        query_vec = self.encoder.encode(
            [query_prompt], 
            normalize_embeddings=True, 
            convert_to_numpy=True
        )
        
        # B. Search Index
        # D = Distances (Scores), I = Indices (IDs)
        D, I = self.index.search(query_vec, k)
        
        # C. Retrieve Content
        results = []
        for i in range(k):
            idx = I[0][i]
            score = D[0][i]
            
            # Map ID back to text
            chunk_data = self.chunks[idx]
            
            # Append clean result
            results.append({
                "text": chunk_data['text'],
                "title": chunk_data['title'],
                "score": float(score),
                "url": chunk_data['source_url']
            })
            
        return results

# Initialize the Retriever once
retriever = Retriever()

‚öôÔ∏è Loading Retrieval Engine...
‚úÖ System Ready. Index contains 39141 documents.


In [3]:
# ==========================================
# CELL 3: THE GENERATOR (The "Brain")
# ==========================================
def generate_rag_answer(query, retriever_instance, llm_model="llama-3.3-70b-versatile"):
    """
    The Full RAG Pipeline: Retrieve -> Augment -> Generate
    """
    # 1. RETRIEVE
    # We fetch top 3 chunks. 3 is usually the sweet spot for context window vs focus.
    retrieved_docs = retriever_instance.search(query, k=3)
    
    # 2. AUGMENT (Context Construction)
    # We join the chunks into a single string for the LLM
    context_text = ""
    for i, doc in enumerate(retrieved_docs):
        context_text += f"Source {i+1} ({doc['title']}):\n{doc['text']}\n\n"
    
    # 3. PROMPT ENGINEERING
    # We force the model to be a "Strict Scholar" - only using provided context.
    system_prompt = """You are a helpful, accurate AI assistant. 
    You have access to a specific Knowledge Base. 
    ALWAYS answer the user's question using ONLY the context provided below.
    If the answer is not in the context, strictly state: "I cannot answer this based on the provided documents."
    Do not hallucinate or use outside knowledge.
    """
    
    user_prompt = f"""
    Context Information:
    ---------------------
    {context_text}
    ---------------------
    
    User Question: {query}
    
    Answer:
    """
    
    # 4. GENERATE (Groq API)
    chat_completion = client.chat.completions.create(
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ],
        model=llm_model,
        temperature=0.1, # Low temp = more factual/deterministic
        max_tokens=512,  # Keep answers concise
    )
    
    response_text = chat_completion.choices[0].message.content
    
    return response_text, retrieved_docs

print("‚úÖ RAG Function Defined.")

‚úÖ RAG Function Defined.


In [None]:
# ==========================================
# CELL 4: INTERACTIVE TEST
# ==========================================
# Try asking about something specific in your dataset
query = "When did Beyonce start becoming popular?"
# Or: "Who managed Destiny's Child?" (If Beyonce is in your dataset)

print(f"‚ùì Question: {query}\n")

answer, sources = generate_rag_answer(query, retriever)

print("ü§ñ AI Answer:")
print("-" * 60)
print(answer)
print("-" * 60)

print("\nüìÑ Sources Used:")
for source in sources:
    print(f"   ‚Ä¢ {source['title']} (Score: {source['score']:.4f}): {source['text']}")
    # print(f"     Preview: {source['text'][:100]}...")

In [5]:
# ==========================================
# CELL 2: CONNECT YOUR COMPONENTS
# ==========================================
# Assuming 'retriever' and 'generate_rag_answer' are defined in previous cells

# Initialize the Evaluator
evaluator = RAGEvaluator(
    retriever_instance=retriever,
    generator_func=generate_rag_answer,  # The function from Notebook 03
    groq_client=client
)

## WITHOUT IMPOSSIBLE Questions
only 50 samples can be used as its the api limit
and it contains the judge eval also

In [6]:
### WITHOUT IMPOSSIBLE
with open("./data/raw/squad_eval_set_all.json", "r") as f:
    raw_data = json.load(f)
    
# Filter only validation data
# eval_dataset = raw_data
eval_dataset = [x for x in raw_data if x['is_impossible'] == False]
# eval_dataset = [x for x in raw_data if x['split'] == 'validation']
print(f"üìö Loaded {len(eval_dataset)} validation questions.")

del raw_data
gc.collect()

# ==========================================
# CELL 3: RUN EXPERIMENT 1 (Baseline)
# ==========================================

# Let's run a test with 30 questions
df, summary = evaluator.run_experiment(
    dataset=eval_dataset,
    experiment_name="not_impossible",
    rag_type="naive_rag",
    model_name="llama-3.3-70b-versatile",
    # model_name="openai/gpt-oss-120b",
    sample_size=50,
    target_rpm=15,                     # Slower for safety (Judge adds calls)
    use_llm_judge=False,                # Enable Judge
    # judge_model = "llama-3.3-70b-versatile"
    # judge_model="openai/gpt-oss-20b"
    judge_model="openai/gpt-oss-120b" # Pick your judge!
)

# View the first few rows of the saved data
df.head()

üìö Loaded 92749 validation questions.
‚è±Ô∏è Target RPM: 15 | Sleep Base: 4.80s
Model: llama-3.3-70b-versatile, Judge: openai/gpt-oss-120b
üé≤ Sampling 50 questions...
üöÄ Experiment: not_impossible | Judge: False


Evaluating: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [04:47<00:00,  5.76s/it]


üìä EXPERIMENT SUMMARY
Total                    : 50.0000
Avg F1                   : 0.2880
Avg Exact Match          : 0.1000
Avg Hit Rate             : 0.9000
Avg Faithfulness         : 0.0000
Avg Relevance            : 0.0000
Avg Context Utility      : 0.0000
Avg Coherence            : 0.0000
Avg Semantic Similarity  : 0.0000
Total API Calls          : 50.0000
üíæ Saved: ./data/results/eval_not_impossible_naive_rag_20251221_105432.csv





Unnamed: 0,run_id,experiment_name,rag_type,model_name,judge_model,timestamp,question,is_impossible,generated_answer,gold_answers,...,exact_match,f1_score,judge_faithfulness,judge_relevance,judge_utility,judge_coherence,judge_similarity,judge_neg_rejection,judge_reasoning,retrieved_context_text
0,not_impossible_20251221_105432,not_impossible,naive_rag,llama-3.3-70b-versatile,openai/gpt-oss-120b,20251221_105432,What was the name of the scientist who develop...,False,The scientist who developed and published a mo...,['Jean-Baptiste Lamarck'],...,0,0.2105,0.0,0.0,0.0,0.0,0.0,,,Charles Darwin's grandfather Erasmus Darwin ou...
1,not_impossible_20251221_105432,not_impossible,naive_rag,llama-3.3-70b-versatile,openai/gpt-oss-120b,20251221_105432,What is the name of the capital city of Myanmar?,False,The capital city of Myanmar is Naypyidaw.,['capital city is Naypyidaw'],...,0,0.8,0.0,0.0,0.0,0.0,0.0,,,"Myanmar, officially the Republic of the Union ..."
2,not_impossible_20251221_105432,not_impossible,naive_rag,llama-3.3-70b-versatile,openai/gpt-oss-120b,20251221_105432,What did Tito offer to the retreating column?,False,Tito offered amnesty to the retreating column.,['amnesty'],...,0,0.2857,0.0,0.0,0.0,0.0,0.0,,,Tito's estrangement from the USSR enabled Yugo...
3,not_impossible_20251221_105432,not_impossible,naive_rag,llama-3.3-70b-versatile,openai/gpt-oss-120b,20251221_105432,What did these attacks accomplish?,False,I cannot answer this based on the provided doc...,['breaks in morale'],...,0,0.0,0.0,0.0,0.0,0.0,0.0,,,"Additionally, recent attacker motivations can ..."
4,not_impossible_20251221_105432,not_impossible,naive_rag,llama-3.3-70b-versatile,openai/gpt-oss-120b,20251221_105432,Who does the Chief of Staff serve as the princ...,False,The Chief of Staff of the Army serves as the p...,['Secretary of the Army'],...,0,0.3,0.0,0.0,0.0,0.0,0.0,,,The army is led by a civilian Secretary of the...


In [None]:
df_not_impossible = pd.read_csv("./data/results/eval_not_impossible_naive_rag_20251221_105432.csv")
df_not_impossible.to_csv("final_results/NI_naive_rag.csv")

(50, 26)


Series([], Name: generated_answer, dtype: object)

In [6]:
csv_path = "final_results/NI_naive_rag.csv"
df = pd.read_csv(csv_path)
print(df.shape)
df[df['generated_answer'].str.contains('Error')]['generated_answer']

start = 40
end = 50
df_batch = df[start:end]
print(df_batch.shape)
df_batch.tail()

(50, 26)
(10, 26)


Unnamed: 0.1,Unnamed: 0,run_id,experiment_name,rag_type,model_name,judge_model,timestamp,question,is_impossible,generated_answer,...,exact_match,f1_score,judge_faithfulness,judge_relevance,judge_utility,judge_coherence,judge_similarity,judge_neg_rejection,judge_reasoning,retrieved_context_text
45,45,not_impossible_20251221_105432,not_impossible,naive_rag,llama-3.3-70b-versatile,openai/gpt-oss-120b,20251221_105432,When was the final French and Indian War fought?,False,The French and Indian War was fought from 1754...,...,0,0.0,0.0,0.0,0.0,0.0,0.0,,,The French and Indian War (1754‚Äì1763) was the ...
46,46,not_impossible_20251221_105432,not_impossible,naive_rag,llama-3.3-70b-versatile,openai/gpt-oss-120b,20251221_105432,What was the Union's original war aim?,False,I cannot answer this based on the provided doc...,...,0,0.0,0.0,0.0,0.0,0.0,0.0,,,American Civil War\nThe American Civil War was...
47,47,not_impossible_20251221_105432,not_impossible,naive_rag,llama-3.3-70b-versatile,openai/gpt-oss-120b,20251221_105432,In what year was it decided that cardinal bish...,False,I cannot answer this based on the provided doc...,...,0,0.0,0.0,0.0,0.0,0.0,0.0,,,Orders and their chief offices\nCardinal bisho...
48,48,not_impossible_20251221_105432,not_impossible,naive_rag,llama-3.3-70b-versatile,openai/gpt-oss-120b,20251221_105432,Hyponatremia is the term that refers to which ...,False,I cannot answer this based on the provided doc...,...,0,0.0,0.0,0.0,0.0,0.0,0.0,,,"Homeostasis\nFor any animal, survival requires..."
49,49,not_impossible_20251221_105432,not_impossible,naive_rag,llama-3.3-70b-versatile,openai/gpt-oss-120b,20251221_105432,How much of Greece's energy consumption came f...,False,"In 2008, renewable energy accounted for 8% of ...",...,0,0.1538,0.0,0.0,0.0,0.0,0.0,,,Energy\nEnergy production in Greece is dominat...


In [7]:
### LLM AS A JUDGE
df_batch = df[start:end]
print(df_batch.shape) ### RUN AGAIN

# Run the judge
df_judged, summary_judged = evaluator.evaluate_batch(
    results_path_or_df=df_batch,
    judge_model="openai/gpt-oss-120b",
    target_rpm = 15
)
for k, v in summary_judged.items(): print(f"{k:<25}: {v:.4f}")

(10, 26)
‚è±Ô∏è  Refining Evaluations | Target RPM: 15 | Model: openai/gpt-oss-120b


Judging: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10/10 [05:03<00:00, 30.39s/it]

Saving evaluated results to judged/re_evaluated_results_20251222_090259.csv

üìä RE-EVALUATION SUMMARY
Total                    : 10.0000
Avg Faithfulness         : 0.5500
Avg Relevance            : 0.6000
Avg Context Utility      : 0.7400
Avg Coherence            : 0.9900
Avg Semantic Similarity  : 0.5000
Total API Calls          : 50.0000
üíæ Saved: judged/re_evaluated_results_20251222_090259.csv
Total                    : 10.0000
Avg Faithfulness         : 0.5500
Avg Relevance            : 0.6000
Avg Context Utility      : 0.7400
Avg Coherence            : 0.9900
Avg Semantic Similarity  : 0.5000
Total API Calls          : 50.0000





In [8]:
df40_50 = pd.read_csv('judged/re_evaluated_results_20251222_090259.csv')
df40_50[df40_50['judge_reasoning'].str.contains('Error')]['judge_reasoning']

Series([], Name: judge_reasoning, dtype: object)

In [9]:
### WITHOUT IMPOSSIBLE
df0_10 = pd.read_csv('judged/re_evaluated_results_20251221_111908.csv')
df10_20 = pd.read_csv('judged/re_evaluated_results_20251221_112553.csv')
df20_30 = pd.read_csv('judged/re_evaluated_results_20251221_113235.csv')
df30_40 = pd.read_csv('judged/re_evaluated_results_20251221_113834.csv')
df40_50 = pd.read_csv('judged/re_evaluated_results_20251222_090259.csv')
df_final_NI = pd.concat([df0_10, df10_20, df20_30, df30_40, df40_50], ignore_index=True)
df_final_NI.to_csv('final_results/NI_naive_rag_judged.csv', index=False)

## WITH IMPOSSIBLE Questions
only 50 samples can be used as its the api limit
and it contains the judge eval also

In [10]:
### WITH IMPOSSIBLE
with open("./data/raw/squad_eval_set_all.json", "r") as f:
    raw_data = json.load(f)
    
# Filter only validation data
eval_dataset = raw_data
# eval_dataset = [x for x in raw_data if x['is_impossible'] == False]
# eval_dataset = [x for x in raw_data if x['split'] == 'validation']
print(f"üìö Loaded {len(eval_dataset)} validation questions.")

del raw_data
gc.collect()

# ==========================================
# CELL 3: RUN EXPERIMENT 1 (Baseline)
# ==========================================

df1, summary = evaluator.run_experiment(
    dataset=eval_dataset,
    experiment_name="with_impossible",
    rag_type="naive_rag",
    model_name="llama-3.3-70b-versatile",
    # model_name="openai/gpt-oss-120b",
    sample_size=50,
    target_rpm=15,                     # Slower for safety (Judge adds calls)
    use_llm_judge=False,                # Enable Judge
    # judge_model = "llama-3.3-70b-versatile"
    # judge_model="openai/gpt-oss-20b"
    judge_model="openai/gpt-oss-120b" 
)

# View the first few rows of the saved data
df1.head()

üìö Loaded 142192 validation questions.
‚è±Ô∏è Target RPM: 15 | Sleep Base: 4.80s
Model: llama-3.3-70b-versatile, Judge: openai/gpt-oss-120b
üé≤ Sampling 50 questions...
üöÄ Experiment: with_impossible | Judge: False


Evaluating: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [04:50<00:00,  5.81s/it]


üìä EXPERIMENT SUMMARY
Total                    : 50.0000
Avg F1                   : 0.1293
Avg Exact Match          : 0.0600
Avg Hit Rate             : 0.7400
Avg Faithfulness         : 0.0000
Avg Relevance            : 0.0000
Avg Context Utility      : 0.0000
Avg Coherence            : 0.0000
Avg Semantic Similarity  : 0.0000
Total API Calls          : 50.0000
üíæ Saved: ./data/results/eval_with_impossible_naive_rag_20251222_090410.csv





Unnamed: 0,run_id,experiment_name,rag_type,model_name,judge_model,timestamp,question,is_impossible,generated_answer,gold_answers,...,exact_match,f1_score,judge_faithfulness,judge_relevance,judge_utility,judge_coherence,judge_similarity,judge_neg_rejection,judge_reasoning,retrieved_context_text
0,with_impossible_20251222_090410,with_impossible,naive_rag,llama-3.3-70b-versatile,openai/gpt-oss-120b,20251222_090410,How long is the road that connects the largest...,True,I cannot answer this based on the provided doc...,[],...,0,0.0,0.0,0.0,0.0,0.0,0.0,,,"Transport\nSince the 1980s, the road and rail ..."
1,with_impossible_20251222_090410,with_impossible,naive_rag,llama-3.3-70b-versatile,openai/gpt-oss-120b,20251222_090410,When families were trying to ruin their immedi...,True,I cannot answer this based on the provided doc...,[],...,0,0.0,0.0,0.0,0.0,0.0,0.0,,,Endemic species can be threatened with extinct...
2,with_impossible_20251222_090410,with_impossible,naive_rag,llama-3.3-70b-versatile,openai/gpt-oss-120b,20251222_090410,Who was Wallis Simpson's second husband?,True,I cannot answer this based on the provided doc...,[],...,0,0.0,0.0,0.0,0.0,0.0,0.0,,,"As Edward was unmarried and had no children, A..."
3,with_impossible_20251222_090410,with_impossible,naive_rag,llama-3.3-70b-versatile,openai/gpt-oss-120b,20251222_090410,What does 'Pal Monqolica' mean?,True,I cannot answer this based on the provided doc...,[],...,0,0.0,0.0,0.0,0.0,0.0,0.0,,,The Pala Empire (Bengali: ‡¶™‡¶æ‡¶≤ ‡¶∏‡¶æ‡¶Æ‡ßç‡¶∞‡¶æ‡¶ú‡ßç‡¶Ø Pal Sa...
4,with_impossible_20251222_090410,with_impossible,naive_rag,llama-3.3-70b-versatile,openai/gpt-oss-120b,20251222_090410,What was the name of Tancred's nephew?,True,I cannot answer this based on the provided doc...,[],...,0,0.0,0.0,0.0,0.0,0.0,0.0,,,Asclepius is the most famous son of Apollo. Hi...


In [11]:
df_impossible = pd.read_csv("data/results/eval_with_impossible_naive_rag_20251222_090410.csv")
df_impossible.to_csv("final_results/Imp_naive_rag.csv")
df_impossible[df_impossible['generated_answer'].str.contains('Error')]['generated_answer']

Series([], Name: generated_answer, dtype: object)

In [21]:
csv_path = "final_results/Imp_naive_rag.csv"
df = pd.read_csv(csv_path)
print(df.shape)
df[df['generated_answer'].str.contains('Error')]['generated_answer']

start = 20
end = 30
df_batch = df[start:end]
print(df_batch.shape)
df_batch.tail()

(50, 26)
(10, 26)


Unnamed: 0.1,Unnamed: 0,run_id,experiment_name,rag_type,model_name,judge_model,timestamp,question,is_impossible,generated_answer,...,exact_match,f1_score,judge_faithfulness,judge_relevance,judge_utility,judge_coherence,judge_similarity,judge_neg_rejection,judge_reasoning,retrieved_context_text
25,25,with_impossible_20251222_090410,with_impossible,naive_rag,llama-3.3-70b-versatile,openai/gpt-oss-120b,20251222_090410,What do reception statutes state is the law of...,False,Reception statutes generally state that the co...,...,0,0.2424,0.0,0.0,0.0,0.0,0.0,,,American common law\nThe United States and mos...
26,26,with_impossible_20251222_090410,with_impossible,naive_rag,llama-3.3-70b-versatile,openai/gpt-oss-120b,20251222_090410,How many soldiers from Napoleon's army were re...,False,"According to Source 3 (Napoleon), 1,500 soldie...",...,0,0.1429,0.0,0.0,0.0,0.0,0.0,,,"On 12 June Napoleon led about 124,000 men, kno..."
27,27,with_impossible_20251222_090410,with_impossible,naive_rag,llama-3.3-70b-versatile,openai/gpt-oss-120b,20251222_090410,In which area was the 1995 report based?,True,I cannot answer this based on the provided doc...,...,0,0.0,0.0,0.0,0.0,0.0,0.0,,,Oklahoma City is on the I-35 and I-40 corridor...
28,28,with_impossible_20251222_090410,with_impossible,naive_rag,llama-3.3-70b-versatile,openai/gpt-oss-120b,20251222_090410,What was created when the interest charged was...,False,I cannot answer this based on the provided doc...,...,0,0.0,0.0,0.0,0.0,0.0,0.0,,,Commercial credit and agricultural consumer lo...
29,29,with_impossible_20251222_090410,with_impossible,naive_rag,llama-3.3-70b-versatile,openai/gpt-oss-120b,20251222_090410,The mechanical action of what allows insects t...,False,The mechanical action of appendages allows ins...,...,0,0.2,0.0,0.0,0.0,0.0,0.0,,,Insects were the earliest organisms to produce...


In [22]:
### LLM AS A JUDGE
df_batch = df[start:end]
print(df_batch.shape) ### RUN AGAIN

# Run the judge
df_judged, summary_judged = evaluator.evaluate_batch(
    results_path_or_df=df_batch,
    judge_model="openai/gpt-oss-120b",
    target_rpm = 15
)
for k, v in summary_judged.items(): print(f"{k:<25}: {v:.4f}")

(10, 26)
‚è±Ô∏è  Refining Evaluations | Target RPM: 15 | Model: openai/gpt-oss-120b


Judging: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10/10 [05:22<00:00, 32.25s/it]

Saving evaluated results to judged/re_evaluated_results_20251222_093232.csv

üìä RE-EVALUATION SUMMARY
Total                    : 10.0000
Avg Faithfulness         : 0.5800
Avg Relevance            : 0.4200
Avg Context Utility      : 0.6100
Avg Coherence            : 1.0000
Avg Semantic Similarity  : 0.9000
Total API Calls          : 54.0000
üíæ Saved: judged/re_evaluated_results_20251222_093232.csv
Total                    : 10.0000
Avg Faithfulness         : 0.5800
Avg Relevance            : 0.4200
Avg Context Utility      : 0.6100
Avg Coherence            : 1.0000
Avg Semantic Similarity  : 0.9000
Total API Calls          : 54.0000





In [23]:
df20_30 = pd.read_csv('judged/re_evaluated_results_20251222_093232.csv')
df20_30[df20_30['judge_reasoning'].str.contains('Error')]['judge_reasoning']

Series([], Name: judge_reasoning, dtype: object)

In [None]:
### WITH IMPOSSIBLE
df0_10 = pd.read_csv('judged/re_evaluated_results_20251222_091736.csv')
df10_20 = pd.read_csv('judged/re_evaluated_results_20251222_092623.csv')
df20_30 = pd.read_csv('judged/re_evaluated_results_20251222_093232.csv')
df30_40 = pd.read_csv('')
df40_50 = pd.read_csv('')
df_final_Imp = pd.concat([df0_10, df10_20, df20_30, df30_40, df40_50], ignore_index=True)
df_final_Imp.to_csv('final_results/Imp_naive_rag_judged.csv', index=False)

## COMBINING BOTH FILES

In [None]:
df_final_NI = pd.read_csv('final_results/NI_naive_rag_judged.csv')
df_final_Imp = pd.read_csv('final_results/Imp_naive_rag_judged.csv')
print(df_final_NI.shape)
print(df_final_Imp.shape)
df_final = pd.concat([df_final_NI, df_final_Imp], ignore_index=True)
print(df_final.shape)
df_final.to_csv('final_results/naive_rag_judged.csv', index=False)

In [None]:
### SUMMARY STATISTICS
