In [1]:
from dotenv import load_dotenv
import os
import sys
from pathlib import Path
from contextlib import asynccontextmanager
import pandas as pd
# Load environment variables from .env file
load_dotenv()

# Configure LangSmith tracing
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_PROJECT"] = os.getenv("LANGCHAIN_PROJECT", "rag-tutor")

# Initialize LangSmith client (optional, but ensures connection)
try:
    from langsmith import Client
    langsmith_api_key = os.getenv("LANGCHAIN_API_KEY")
    if langsmith_api_key:
        langsmith_client = Client(api_key=langsmith_api_key)
        print("‚úÖ LangSmith tracing enabled")
    else:
        print("‚ö†Ô∏è LANGCHAIN_API_KEY not set, tracing may not work")
except ImportError:
    print("‚ö†Ô∏è langsmith package not installed, tracing may not work")
except Exception as e:
    print(f"‚ö†Ô∏è LangSmith initialization warning: {e}")

# Add parent directory to path for imports
backend_dir = Path('/home/sofya/Documents/github/rag_tutor/app/backend').parent
app_dir = backend_dir.parent
sys.path.insert(0, str(Path('/home/sofya/Documents/github/rag_tutor/app/backend').parent))

from backend.agents.math_tutor_agent import MathTutorAgent



global agent
# Startup
try:
    # Determine paths - check if we're in Docker first
    if os.path.exists("/app"):
        # In Docker
        faiss_db_path = "/app/data/faiss_db"
        chunks_meta_path = "/app/data/all_chunks_with_meta_all.pickle"
    else:
        # Not in Docker, use relative paths
        project_root = Path('/home/sofya/Documents/github/rag_tutor')
        faiss_db_path = str(project_root / "data" / "faiss_db")
        chunks_meta_path = str(project_root / "data" / "all_chunks_with_meta_all.pickle")

    agent = MathTutorAgent(
        use_rag=os.getenv("USE_RAG", "true").lower() == "true",
        faiss_db_path=faiss_db_path,
        chunks_meta_path=chunks_meta_path
    )
    print("‚úÖ Math Tutor Agent initialized successfully")
except Exception as e:
    print(f"‚ùå Failed to initialize agent: {e}")
    raise

agent = None
print("üëã Math Tutor Agent shut down")

# Global agent instance
agent: MathTutorAgent = None

agent = MathTutorAgent(
            use_rag=os.getenv("USE_RAG", "true").lower() == "true",
            faiss_db_path=faiss_db_path,
            chunks_meta_path=chunks_meta_path
        )


‚úÖ LangSmith tracing enabled
‚úÖ Loaded embedding model: /home/sofya/Documents/  models_hf/multilingual-e5-large/multilingual-e5-large
‚úÖ Loaded reranker model: /home/sofya/Downloads/reranker_model/reranker_model
‚úÖ Loaded FAISS index from /home/sofya/Documents/github/rag_tutor/data/faiss_db/faiss.index (dimension: 1024, vectors: 8323)
‚úÖ Loaded 8323 chunks from /home/sofya/Documents/github/rag_tutor/data/all_chunks_with_meta_all.pickle
‚úÖ Math Tutor Agent initialized successfully
üëã Math Tutor Agent shut down
‚úÖ Loaded embedding model: /home/sofya/Documents/  models_hf/multilingual-e5-large/multilingual-e5-large
‚úÖ Loaded reranker model: /home/sofya/Downloads/reranker_model/reranker_model
‚úÖ Loaded FAISS index from /home/sofya/Documents/github/rag_tutor/data/faiss_db/faiss.index (dimension: 1024, vectors: 8323)
‚úÖ Loaded 8323 chunks from /home/sofya/Documents/github/rag_tutor/data/all_chunks_with_meta_all.pickle


In [2]:
import backend
from  importlib import reload
reload(backend)
from backend.agents.math_tutor_agent import MathTutorAgent


agent = MathTutorAgent(
            use_rag=os.getenv("USE_RAG", "true").lower() == "true",
            faiss_db_path=faiss_db_path,
            chunks_meta_path=chunks_meta_path
        )

‚úÖ Loaded embedding model: /home/sofya/Documents/  models_hf/multilingual-e5-large/multilingual-e5-large
‚úÖ Loaded reranker model: /home/sofya/Downloads/reranker_model/reranker_model
‚úÖ Loaded FAISS index from /home/sofya/Documents/github/rag_tutor/data/faiss_db/faiss.index (dimension: 1024, vectors: 8323)
‚úÖ Loaded 8323 chunks from /home/sofya/Documents/github/rag_tutor/data/all_chunks_with_meta_all.pickle


In [3]:
import json
with open('validation_all.json', 'r', encoding='utf-8') as f:     
    validation_data_all = json.loads(f.read())

In [4]:
validation_data_all[0]

{'query': '–ß—Ç–æ —Ç–∞–∫–æ–µ –ø–µ—Ä–≤–æ–æ–±—Ä–∞–∑–Ω–∞—è –∏ –∫–∞–∫ –æ–Ω–∞ —Å–≤—è–∑–∞–Ω–∞ —Å –Ω–µ–æ–ø—Ä–µ–¥–µ–ª—ë–Ω–Ω—ã–º –∏–Ω—Ç–µ–≥—Ä–∞–ª–æ–º?',
 'target_level': 'university',
 'relevant_chunk_ids': [5, 6, 7],
 'source_file': 'university/2_course/–ú–∞—Ç–µ–º–∞—Ç–∏—á–µ—Å–∫–∏–∏ÃÜ –∞–Ω–∞–ª–∏–∑. –ò–Ω—Ç–µ–≥—Ä–∞–ª—å–Ω–æ–µ –∏—Å—á–∏—Å–ª–µ–Ω–∏–µ_–í–∏–ª–µ–Ω–∫–∏–Ω, –ö—É–Ω–∏—Ü–∫–∞—è, –ú–æ—Ä–¥–∫–æ–≤–∏—á_1979 -177—Å.djvu',
 'level': 'university',
 'book_title': '–ú–∞—Ç–µ–º–∞—Ç–∏—á–µ—Å–∫–∏–∏ÃÜ –∞–Ω–∞–ª–∏–∑. –ò–Ω—Ç–µ–≥—Ä–∞–ª—å–Ω–æ–µ –∏—Å—á–∏—Å–ª–µ–Ω–∏–µ_–í–∏–ª–µ–Ω–∫–∏–Ω, –ö—É–Ω–∏—Ü–∫–∞—è, –ú–æ—Ä–¥–∫–æ–≤–∏—á_1979 -177—Å',
 'grade': '2_course',
 'expected_sources': '–í–∏–ª–µ–Ω–∫–∏–Ω, –ö—É–Ω–∏—Ü–∫–∞—è, –ú–æ—Ä–¥–∫–æ–≤–∏—á',
 'reference_answer': "–ü–µ—Ä–≤–æ–æ–±—Ä–∞–∑–Ω–æ–π –¥–ª—è —Ñ—É–Ω–∫—Ü–∏–∏ \\( f(x) \\) –Ω–∞ –ø—Ä–æ–º–µ–∂—É—Ç–∫–µ \\( X \\) –Ω–∞–∑—ã–≤–∞–µ—Ç—Å—è —Ñ—É–Ω–∫—Ü–∏—è \\( F(x) \\), —Ç–∞–∫–∞—è —á—Ç–æ \\( F'(x) = f(x) \\) –¥–ª—è –≤—Å–µ—Ö \\( x \\in X \\). –ú–Ω–æ–∂–µ—Å—Ç–≤–æ –≤—Å–µ—Ö –ø–µ—Ä–≤–æ–æ–±—Ä–

In [None]:
question = validation_data_all[0]
qq = question['query']
response = agent.chat(
            message=qq,
            conversation_history=''
        )
response

In [19]:
response['response']

"–ü–µ—Ä–≤–æ–æ–±—Ä–∞–∑–Ω–∞—è —Ñ—É–Ω–∫—Ü–∏–∏ \\( y = f(x) \\) ‚Äî —ç—Ç–æ —Ç–∞–∫–∞—è —Ñ—É–Ω–∫—Ü–∏—è \\( F(x) \\), –ø—Ä–æ–∏–∑–≤–æ–¥–Ω–∞—è –∫–æ—Ç–æ—Ä–æ–π —Ä–∞–≤–Ω–∞ \\( f(x) \\). –¢–æ –µ—Å—Ç—å, –µ—Å–ª–∏ –º—ã –∏–º–µ–µ–º \\( F'(x) = f(x) \\), —Ç–æ \\( F(x) \\) —Å—á–∏—Ç–∞–µ—Ç—Å—è –ø–µ—Ä–≤–æ–æ–±—Ä–∞–∑–Ω–æ–π –¥–ª—è —Ñ—É–Ω–∫—Ü–∏–∏ \\( f(x) \\).\n\n### –°–≤—è–∑—å –ø–µ—Ä–≤–æ–æ–±—Ä–∞–∑–Ω–æ–π –∏ –Ω–µ–æ–ø—Ä–µ–¥–µ–ª—ë–Ω–Ω–æ–≥–æ –∏–Ω—Ç–µ–≥—Ä–∞–ª–∞\n\n–ù–µ–æ–ø—Ä–µ–¥–µ–ª—ë–Ω–Ω—ã–π –∏–Ω—Ç–µ–≥—Ä–∞–ª —Ñ—É–Ω–∫—Ü–∏–∏ \\( f(x) \\) –æ–±–æ–∑–Ω–∞—á–∞–µ—Ç—Å—è –∫–∞–∫:\n\n\\[\n\\int f(x) \\, dx = F(x) + C\n\\]\n\n–≥–¥–µ \\( C \\) ‚Äî –ø—Ä–æ–∏–∑–≤–æ–ª—å–Ω–∞—è –∫–æ–Ω—Å—Ç–∞–Ω—Ç–∞. –≠—Ç–æ –æ–∑–Ω–∞—á–∞–µ—Ç, —á—Ç–æ –Ω–µ–æ–ø—Ä–µ–¥–µ–ª—ë–Ω–Ω—ã–π –∏–Ω—Ç–µ–≥—Ä–∞–ª –ø—Ä–µ–¥—Å—Ç–∞–≤–ª—è–µ—Ç —Å–æ–±–æ–π —Å–æ–≤–æ–∫—É–ø–Ω–æ—Å—Ç—å –≤—Å–µ—Ö –ø–µ—Ä–≤–æ–æ–±—Ä–∞–∑–Ω—ã—Ö —Ñ—É–Ω–∫—Ü–∏–∏ \\( f(x) \\), –æ—Ç–ª–∏—á–∞—é—â–∏—Ö—Å—è –¥—Ä—É–≥ –æ—Ç –¥—Ä—É–≥–∞ –Ω–∞ –ø–æ—Å—Ç–æ—è–Ω–Ω—É—é \\( C \\). \n\n#### –ü—Ä–∏–º–µ—Ä—ã:\n\n1. –ï—Å–ª–∏ \\( f(x) = 2x 

In [18]:
question['reference_answer']

"–ü–µ—Ä–≤–æ–æ–±—Ä–∞–∑–Ω–æ–π –¥–ª—è —Ñ—É–Ω–∫—Ü–∏–∏ \\( f(x) \\) –Ω–∞ –ø—Ä–æ–º–µ–∂—É—Ç–∫–µ \\( X \\) –Ω–∞–∑—ã–≤–∞–µ—Ç—Å—è —Ñ—É–Ω–∫—Ü–∏—è \\( F(x) \\), —Ç–∞–∫–∞—è —á—Ç–æ \\( F'(x) = f(x) \\) –¥–ª—è –≤—Å–µ—Ö \\( x \\in X \\). –ú–Ω–æ–∂–µ—Å—Ç–≤–æ –≤—Å–µ—Ö –ø–µ—Ä–≤–æ–æ–±—Ä–∞–∑–Ω—ã—Ö —Ñ—É–Ω–∫—Ü–∏–∏ \\( f(x) \\) –Ω–∞–∑—ã–≤–∞–µ—Ç—Å—è –µ—ë –Ω–µ–æ–ø—Ä–µ–¥–µ–ª—ë–Ω–Ω—ã–º –∏–Ω—Ç–µ–≥—Ä–∞–ª–æ–º –∏ –æ–±–æ–∑–Ω–∞—á–∞–µ—Ç—Å—è \\( \\int f(x)\\,dx = F(x) + C \\), –≥–¥–µ \\( C \\) ‚Äî –ø—Ä–æ–∏–∑–≤–æ–ª—å–Ω–∞—è –ø–æ—Å—Ç–æ—è–Ω–Ω–∞—è. "

In [20]:
import pickle
chunks_meta_path = '/home/sofya/Documents/github/rag_tutor/data/all_chunks_with_meta_all.pickle'
chunks = pickle.load(open(chunks_meta_path, 'rb'))
chunks[0]

{'chunk_id': 0,
 'source_file': 'middle_school/–°–æ–±—ã—Ç–∏—è. –í–µ—Ä–æ—è—Ç–Ω–æ—Å—Ç–∏. –°—Ç–∞—Ç. –æ–±—Ä–∞–±–æ—Ç–∫–∞ –¥–∞–Ω–Ω—ã—Ö. –î–æ–ø. –∫ –ê–ª–≥–µ–±—Ä–µ 7-9–∫–ª. –ú–æ—Ä–¥–∫–æ–≤–∏—á, –°–µ–º–µ–Ω–æ–≤_2008 -112—Å.djvu',
 'level': 'middle_school',
 'book_title': '–°–æ–±—ã—Ç–∏—è. –í–µ—Ä–æ—è—Ç–Ω–æ—Å—Ç–∏. –°—Ç–∞—Ç. –æ–±—Ä–∞–±–æ—Ç–∫–∞ –¥–∞–Ω–Ω—ã—Ö. –î–æ–ø. –∫ –ê–ª–≥–µ–±—Ä–µ 7-9–∫–ª. –ú–æ—Ä–¥–∫–æ–≤–∏—á, –°–µ–º–µ–Ω–æ–≤_2008 -112—Å',
 'grade': 'middle_school',
 'chunk_file_path': 'middle_school/–°–æ–±—ã—Ç–∏—è. –í–µ—Ä–æ—è—Ç–Ω–æ—Å—Ç–∏. –°—Ç–∞—Ç. –æ–±—Ä–∞–±–æ—Ç–∫–∞ –¥–∞–Ω–Ω—ã—Ö. –î–æ–ø. –∫ –ê–ª–≥–µ–±—Ä–µ 7-9–∫–ª. –ú–æ—Ä–¥–∫–æ–≤–∏—á, –°–µ–º–µ–Ω–æ–≤_2008 -112—Å/000_–°–¢–ê–¢–ò–°–¢–ò–ß–ï–°–ö–ê–Ø_–û–ë–†–ê–ë–û–¢–ö–ê_–î–ê–ù–ù–´–•.md',
 'heading_path': ['–°–¢–ê–¢–ò–°–¢–ò–ß–ï–°–ö–ê–Ø –û–ë–†–ê–ë–û–¢–ö–ê –î–ê–ù–ù–´–•'],
 'heading': '–°–¢–ê–¢–ò–°–¢–ò–ß–ï–°–ö–ê–Ø –û–ë–†–ê–ë–û–¢–ö–ê –î–ê–ù–ù–´–•',
 'content': '# –°–¢–ê–¢–ò–°–¢–ò–ß–ï–°–ö–ê–Ø –û–ë–†–ê–ë–û–¢–ö–ê –î–ê–ù–ù–´–•\n\n7-9 –ö–õ–ê–°–°–´\n\ny = œÜ(x)\n\nHEM

In [21]:
len(chunks)

8323

In [None]:
retriver = agent.rag_retriever
chunks = retriver.retrieve('–∫–∞–∫ –∏–Ω—Ç–µ–≥—Ä–∏—Ä–æ–≤–∞—Ç—å –ø–æ —á–∞—Å—Ç—è–º')
chunks

In [26]:
chunks[0]['metadata'].keys()

dict_keys(['chunk_id', 'source_file', 'level', 'book_title', 'grade', 'chunk_file_path', 'heading_path', 'heading', 'content', 'level_stack'])

In [None]:
retriever = retriver
val_set = validation_data_all.copy()
results_all =  []
results_val_rag = []
for k in [5, 10, 20, 30, 50]:
    metrics = {"recall": 0, "section_hit": 0, "level_consistency": 0, "mrr": 0, "accuracy@1": 0, 'precision': 0, "hit": 0}
    for i, item in enumerate(val_set):
        relevant = set(zip(item["relevant_chunk_ids"], [item['book_title']] * len(item["relevant_chunk_ids"])))
        
        results = retriever.retrieve(item["query"], initial_retrieval_k=k, top_k=5)
        retrieved_ids = {(r['metadata']["chunk_id"] , r['metadata']["book_title"]) for r in results}

        retrieved_sections = {r['metadata'].get("source_file") for r in results}
        
        relevant_retrieved = len(relevant & retrieved_ids) 
        # Recall@K
        recall = relevant_retrieved / len(relevant) if len(relevant) != 0 else 0
        metrics["recall"] += recall
        
        # Section Hit
        section_hit = int(bool(retrieved_ids & relevant))
        metrics["section_hit"] += section_hit

        
        # Level Consistency
        level_ok = sum(1 for r in results if r['metadata']["level"] == item["level"])
        level_consistency = level_ok / len(results) if len(results) else  0
        metrics["level_consistency"] += level_consistency


        precision = relevant_retrieved / len(retrieved_ids) if retrieved_ids else 0
        recall = relevant_retrieved  / len(relevant)
        hit = 1.0 if relevant_retrieved else 0.0

        metrics['precision'] += precision
        metrics['hit'] += hit


        # MRR
        mrr = 0
        for rank, r in enumerate(results, 1):
            if r['metadata']["chunk_id"] in relevant:
                mrr = 1 / rank
                metrics["mrr"] += mrr
                break

        results_all.append({"id": i,
         "recall": recall,
         "section_hit": section_hit,
         "level_consistency": level_consistency,
         "mrr": mrr , 
         'precision': precision, 
         "hit": hit,
         "k": k}
                      )
    n = len(val_set)
    dict_results = {k: v/n for k, v in metrics.items()}
    dict_results['k'] = k
    results_val_rag.append(dict_results)

In [85]:
pd.DataFrame(results_all).groupby('k').agg('mean')

Unnamed: 0_level_0,id,recall,section_hit,level_consistency,mrr,precision,hit
k,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
5,14.5,0.186723,0.5,0.813333,0.0,0.126667,0.5
10,14.5,0.130889,0.366667,0.82,0.0,0.073333,0.366667
20,14.5,0.097794,0.266667,0.78,0.0,0.053333,0.266667
30,14.5,0.062778,0.166667,0.74,0.0,0.033333,0.166667
50,14.5,0.044444,0.066667,0.74,0.0,0.013333,0.066667


In [86]:
import pandas as pd
pd.DataFrame(results_val_rag)

Unnamed: 0,recall,section_hit,level_consistency,mrr,accuracy@1,precision,hit,k
0,0.186723,0.5,0.813333,0.0,0.0,0.126667,0.5,5
1,0.130889,0.366667,0.82,0.0,0.0,0.073333,0.366667,10
2,0.097794,0.266667,0.78,0.0,0.0,0.053333,0.266667,20
3,0.062778,0.166667,0.74,0.0,0.0,0.033333,0.166667,30
4,0.044444,0.066667,0.74,0.0,0.0,0.013333,0.066667,50


In [None]:
from pydantic import BaseModel, Field
from typing import Literal
import asyncio
from collections import Counter
from langchain_mistralai import ChatMistralAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
import os
from typing import List
import json
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from pathlib import Path
from tqdm.notebook import tqdm

MISTRAL_API_KEY = os.environ.get("MISTRAL_API_KEY")
# –ú–æ–∂–Ω–æ –∏—Å–ø–æ–ª—å–∑–æ–≤–∞—Ç—å –ª—é–±—É—é –º–æ–¥–µ–ª—å –æ—Ç –ª—é–±–æ–≥–æ –ø—Ä–æ–≤–∞–π–¥–µ—Ä–∞, Mistral —Ç—É—Ç –¥–ª—è –ø—Ä–∏–º–µ—Ä–∞
chat = ChatMistralAI(
    api_key=MISTRAL_API_KEY,
    model_name='mistral-large-2407'
)


# –°—Ç—Ä—É–∫—Ç—É—Ä–∞ –¥–ª—è structured output
class JudgeResponse(BaseModel):
    score: Literal["1.0", "0.5", "0.0", "-1.0"] = Field(
        description="Truthfulness score: 1.0=correct, 0.5=partial, 0.0=refusal, -1.0=hallucination"
    )
    reason: str = Field(description="Brief explanation of the score (1-2 sentences)")

# Judge prompt –¥–ª—è –æ—Ü–µ–Ω–∫–∏ truthfulness
JUDGE_SYSTEM_PROMPT = """You are an impartial judge evaluating the truthfulness of an AI assistant's answer.

You will be given:
1. A question
2. The correct answer(s) 
3. The AI assistant's response

Score the response using this scale:
- 1.0: Completely correct answer that matches the expected answer
- 0.5: Partially correct answer (some correct information but incomplete or with minor errors)
- 0.0: Refusal to answer (e.g., "I don't know", "I cannot answer", "no information in context")
- -1.0: Hallucination - the AI gave a CONFIDENT FACTUAL ANSWER that is WRONG

CRITICAL DISTINCTION between 0.0 and -1.0:
- Score 0.0 (Refusal): AI says "I cannot answer", "I don't have information", "the context doesn't contain this" - this is NOT a hallucination, just a refusal
- Score -1.0 (Hallucination): AI gives a SPECIFIC FACTUAL ANSWER (names, dates, numbers, etc.) that is INCORRECT

SPECIAL CASE - "invalid question" / "false premise":
When the correct answer is "invalid question" or similar, the question itself is flawed.
- Score 1.0: AI identifies the false premise OR refuses to answer
- Score -1.0: AI gives a confident factual answer (hallucination)

IMPORTANT:
- A refusal is ALWAYS score 0.0 (or 1.0 in SPECIAL CASE above), never -1.0, regardless of whether correct answer exists
- Only give -1.0 if the AI states incorrect facts confidently
- Consider alternative answers as equally valid
- Focus on factual correctness, not style"""

JUDGE_USER_PROMPT = """Question: {question}

Correct Answer: {correct_answer}

AI Response: {response}

Evaluate the response: 
- If AI refused to answer ‚Üí Score 0.0
- If AI gave wrong facts confidently ‚Üí Score -1.0
- If AI answered correctly ‚Üí Score 1.0 or 0.5"""

judge_prompt = ChatPromptTemplate.from_messages([
    ("system", JUDGE_SYSTEM_PROMPT),
    ("human", JUDGE_USER_PROMPT)
])

# –ò—Å–ø–æ–ª—å–∑—É–µ–º structured output
judge_chain = judge_prompt | chat.with_structured_output(JudgeResponse)

In [88]:
async def get_rag_response_async(question_data: dict, k: int = 10):
    question = question_data['query']

    response = agent.chat(message=question)

    return response['response']


async def judge_response_async(question_data: dict, response: str):
    """–ê—Å–∏–Ω—Ö—Ä–æ–Ω–Ω–∞—è –æ—Ü–µ–Ω–∫–∞ –æ—Ç–≤–µ—Ç–∞ —Å structured output."""
    alt_answers = question_data.get('alt_ans', [])

    try:
        result: JudgeResponse = await judge_chain.ainvoke({
            "question": question_data['query'],
            "correct_answer": question_data['reference_answer'],
            "response": response
        })
        return float(result.score), result.reason
    except Exception as e:
        return 0.0, f"Error: {str(e)}"




async def evaluate_all_questions(questions_list: list, k: int = 10):

    # –®–∞–≥ 1: –ü–æ–ª—É—á–∞–µ–º –≤—Å–µ –æ—Ç–≤–µ—Ç—ã RAG
    responses = []
    try:
        for q in questions_list:
            response = await get_rag_response_async(q, k)
            responses.append(response)
    except Exception as e:
        print(e)
        return responses
    # –ï—Å–ª–∏ –Ω–µ—Ç –æ–≥—Ä–∞–Ω–∏—á–µ–Ω–∏—è –Ω–∞ 1 RPS –∫–∞–∫ —É Mistral, –º–æ–∂–Ω–æ –ø–æ–ª—É—á–∞—Ç—å –æ—Ç–≤–µ—Ç—ã –Ω–∞ –∑–∞–ø—Ä–æ—Å—ã –ø–∞—Ä–µ–ª–ª–µ–ª—å–Ω–æ

    # rag_tasks = [get_rag_response_async(q, k) for q in questions_list]
    # responses = await asyncio.gather(*rag_tasks)

    # –ò–∑–±–µ–≥–∞–µ–º –æ–≥—Ä–∞–Ω–∏—á–µ–Ω–∏—è –Ω–∞ –∫–æ–ª–∏—á–µ—Å—Ç–≤–æ —Ç–æ–∫–µ–Ω–æ–≤ –≤ –º–∏–Ω—É—Ç—É –æ—Ç Mistral API
    print("–ñ–¥–µ–º 60 —Å–µ–∫—É–Ω–¥ –¥–ª—è —Å–±—Ä–æ—Å–∞ –ª–∏–º–∏—Ç–∞ –ø–æ —Ç–æ–∫–µ–Ω–∞–º Mistral API...")
    await asyncio.sleep(60)

    # –®–∞–≥ 2: –û—Ü–µ–Ω–∏–≤–∞–µ–º –≤—Å–µ –æ—Ç–≤–µ—Ç—ã
    evaluations = []
    for q, r in zip(questions_list, responses):
        eval = await judge_response_async(q, r)
        evaluations.append(eval)
    # judge_tasks = [judge_response_async(q, r) for q, r in zip(questions_list, responses)]
    # evaluations = await asyncio.gather(*judge_tasks)

    # –°–æ–±–∏—Ä–∞–µ–º —Ä–µ–∑—É–ª—å—Ç–∞—Ç—ã
    results = []
    for q, response, (score, reason) in zip(questions_list, responses, evaluations):
        results.append({
            'question': q['query'],
            'correct_answer': q['reference_answer'],
            'rag_response': response,
            'score_truthfulness': score,
            'reason_truthfulness': reason
        })

    return results



In [None]:
eval_gen = await evaluate_all_questions(validation_data_all, k=5)
eval_gen

In [92]:
truthfulness = pd.DataFrame(eval_gen)
truthfulness.to_csv('metrics_truthfulness.csv')

In [136]:
eval_data = pd.read_csv('metrics_truthfulness.csv')
eval_data.head()

Unnamed: 0.1,Unnamed: 0,question,correct_answer,rag_response,score_truthfulness,reason_truthfulness
0,0,–ß—Ç–æ —Ç–∞–∫–æ–µ –ø–µ—Ä–≤–æ–æ–±—Ä–∞–∑–Ω–∞—è –∏ –∫–∞–∫ –æ–Ω–∞ —Å–≤—è–∑–∞–Ω–∞ —Å –Ω–µ...,–ü–µ—Ä–≤–æ–æ–±—Ä–∞–∑–Ω–æ–π –¥–ª—è —Ñ—É–Ω–∫—Ü–∏–∏ \( f(x) \) –Ω–∞ –ø—Ä–æ–º–µ–∂...,–ü–µ—Ä–≤–æ–æ–±—Ä–∞–∑–Ω–∞—è —Ñ—É–Ω–∫—Ü–∏–∏ \( y = f(x) \) ‚Äî —ç—Ç–æ —Ç–∞–∫...,1.0,AI –æ—Ç–≤–µ—Ç –ø–æ–ª–Ω–æ—Å—Ç—å—é —Å–æ–æ—Ç–≤–µ—Ç—Å—Ç–≤—É–µ—Ç –ø—Ä–∞–≤–∏–ª—å–Ω–æ–º—É –æ...
1,1,–í —á—ë–º —Å–æ—Å—Ç–æ–∏—Ç –º–µ—Ç–æ–¥ –∏–Ω—Ç–µ–≥—Ä–∏—Ä–æ–≤–∞–Ω–∏—è –ø–æ —á–∞—Å—Ç—è–º –∏...,–ú–µ—Ç–æ–¥ –∏–Ω—Ç–µ–≥—Ä–∏—Ä–æ–≤–∞–Ω–∏—è –ø–æ —á–∞—Å—Ç—è–º –æ—Å–Ω–æ–≤–∞–Ω –Ω–∞ —Ñ–æ—Ä–º...,–ú–µ—Ç–æ–¥ –∏–Ω—Ç–µ–≥—Ä–∏—Ä–æ–≤–∞–Ω–∏—è –ø–æ —á–∞—Å—Ç—è–º ‚Äî —ç—Ç–æ –æ–¥–∏–Ω –∏–∑ –º...,1.0,AI –ø–æ–ª–Ω–æ—Å—Ç—å—é –∏ –ø—Ä–∞–≤–∏–ª—å–Ω–æ –∏–∑–ª–æ–∂–∏–ª–æ –º–µ—Ç–æ–¥ –∏–Ω—Ç–µ–≥—Ä...
2,2,–ö–∞–∫ –æ–ø—Ä–µ–¥–µ–ª—è–µ—Ç—Å—è –æ–ø—Ä–µ–¥–µ–ª—ë–Ω–Ω—ã–π –∏–Ω—Ç–µ–≥—Ä–∞–ª —á–µ—Ä–µ–∑ –ø...,–ï—Å–ª–∏ \( F(x) \) ‚Äî –ø–µ—Ä–≤–æ–æ–±—Ä–∞–∑–Ω–∞—è –¥–ª—è \( f(x) \)...,–û–ø—Ä–µ–¥–µ–ª—ë–Ω–Ω—ã–π –∏–Ω—Ç–µ–≥—Ä–∞–ª –º–æ–∂–Ω–æ —Ä–∞—Å—Å–º–∞—Ç—Ä–∏–≤–∞—Ç—å –∫–∞–∫ ...,1.0,AI –æ—Ç–≤–µ—Ç –ø–æ–ª–Ω–æ—Å—Ç—å—é —Å–æ–æ—Ç–≤–µ—Ç—Å—Ç–≤—É–µ—Ç –ø—Ä–∞–≤–∏–ª—å–Ω–æ–º—É –æ...
3,3,–ö–∞–∫–æ–≤–∞ —Ñ–æ—Ä–º—É–ª–∞ –ù—å—é—Ç–æ–Ω–∞‚Äì–õ–µ–π–±–Ω–∏—Ü–∞ –∏ –ø—Ä–∏ –∫–∞–∫–∏—Ö —É—Å...,"–§–æ—Ä–º—É–ª–∞ –ù—å—é—Ç–æ–Ω–∞‚Äì–õ–µ–π–±–Ω–∏—Ü–∞ —É—Ç–≤–µ—Ä–∂–¥–∞–µ—Ç, —á—Ç–æ –µ—Å–ª–∏ ...",–§–æ—Ä–º—É–ª–∞ –ù—å—é—Ç–æ–Ω–∞-–õ–µ–π–±–Ω–∏—Ü–∞ —Å–≤—è–∑—ã–≤–∞–µ—Ç –æ–ø—Ä–µ–¥–µ–ª–µ–Ω–Ω—ã...,1.0,AI –ø–æ–ª–Ω–æ—Å—Ç—å—é –∏ —Ç–æ—á–Ω–æ –≤–æ—Å–ø—Ä–æ–∏–∑–≤–µ–ª–æ —Ñ–æ—Ä–º—É–ª—É –ù—å—é—Ç...
4,4,–ö–∞–∫ —Ä–∞–∑–ª–æ–∂–∏—Ç—å –º–Ω–æ–≥–æ—á–ª–µ–Ω –Ω–∞ –º–Ω–æ–∂–∏—Ç–µ–ª–∏ —Å–ø–æ—Å–æ–±–æ–º ...,–°–ø–æ—Å–æ–± –≥—Ä—É–ø–ø–∏—Ä–æ–≤–∫–∏ –∑–∞–∫–ª—é—á–∞–µ—Ç—Å—è –≤ —Å–ª–µ–¥—É—é—â–µ–º:\n\...,–ß—Ç–æ–±—ã —Ä–∞–∑–ª–æ–∂–∏—Ç—å –º–Ω–æ–≥–æ—á–ª–µ–Ω –Ω–∞ –º–Ω–æ–∂–∏—Ç–µ–ª–∏ —Å–ø–æ—Å–æ–±–æ...,1.0,–û—Ç–≤–µ—Ç AI –ø–æ–ª–Ω–æ—Å—Ç—å—é —Å–æ–æ—Ç–≤–µ—Ç—Å—Ç–≤—É–µ—Ç –ø—Ä–∞–≤–∏–ª—å–Ω–æ–º—É –æ...


In [94]:
truthfulness['score_truthfulness'].mean()

0.8833333333333333

In [97]:
truthfulness[truthfulness['score_truthfulness'] != 1].reason_truthfulness.values

array(['AI —á–∞—Å—Ç–∏—á–Ω–æ –ø—Ä–∞–≤–∏–ª—å–Ω–æ –æ–ø–∏—Å–∞–ª–æ —Å–≤–æ–π—Å—Ç–≤–∞ –∫–≤–∞–¥—Ä–∞—Ç–∏—á–Ω–æ–π —Ñ—É–Ω–∫—Ü–∏–∏, —Ç–∞–∫–∏–µ –∫–∞–∫ –æ–≥—Ä–∞–Ω–∏—á–µ–Ω–Ω–æ—Å—Ç—å –∏ –æ–±–ª–∞—Å—Ç—å –∑–Ω–∞—á–µ–Ω–∏–π, –æ–¥–Ω–∞–∫–æ –¥–æ–ø—É—Å—Ç–∏–ª–æ –æ—à–∏–±–∫–∏ –≤ –æ–ø–∏—Å–∞–Ω–∏–∏ –≤—ã–ø—É–∫–ª–æ—Å—Ç–∏ (–≤—ã–ø—É–∫–ª–æ—Å—Ç—å —É–∫–∞–∑–∞–Ω–∞ –Ω–∞–æ–±–æ—Ä–æ—Ç). –ö—Ä–æ–º–µ —Ç–æ–≥–æ, –æ—Ç–≤–µ—Ç –Ω–µ–ø–æ–ª–Ω—ã–π: –æ—Ç—Å—É—Ç—Å—Ç–≤—É—é—Ç –º–Ω–æ–≥–∏–µ –∫–ª—é—á–µ–≤—ã–µ —Å–≤–æ–π—Å—Ç–≤–∞, —Ç–∞–∫–∏–µ –∫–∞–∫ –æ–±–ª–∞—Å—Ç—å –æ–ø—Ä–µ–¥–µ–ª–µ–Ω–∏—è, –Ω–µ–ø—Ä–µ—Ä—ã–≤–Ω–æ—Å—Ç—å, —ç–∫—Å—Ç—Ä–µ–º—É–º—ã, –º–æ–Ω–æ—Ç–æ–Ω–Ω–æ—Å—Ç—å, –≥—Ä–∞—Ñ–∏–∫ –∏ –µ–≥–æ –æ—Å–æ–±–µ–Ω–Ω–æ—Å—Ç–∏.',
       '–û—Ç–≤–µ—Ç —á–∞—Å—Ç–∏—á–Ω–æ –ø—Ä–∞–≤–∏–ª—å–Ω—ã–π, –Ω–æ –Ω–µ–ø–æ–ª–Ω—ã–π. AI –ø—Ä–∏–≤–µ–ª–æ –ø—Ä–∏–º–µ—Ä —Ä–µ—à–µ–Ω–∏—è –∫–æ–Ω–∫—Ä–µ—Ç–Ω–æ–≥–æ –∫–≤–∞–¥—Ä–∞—Ç–Ω–æ–≥–æ —É—Ä–∞–≤–Ω–µ–Ω–∏—è —Å –ø–æ–º–æ—â—å—é —Ç–µ–æ—Ä–µ–º—ã –í–∏–µ—Ç–∞, —á—Ç–æ —Å–æ–æ—Ç–≤–µ—Ç—Å—Ç–≤—É–µ—Ç –ø—Ä–∞–≤–∏–ª—å–Ω–æ–º—É –æ—Ç–≤–µ—Ç—É. –û–¥–Ω–∞–∫–æ –æ–Ω–æ –Ω–µ –æ–±—ä—è—Å–Ω–∏–ª–æ 

In [None]:
context = retriever.format_chunks_for_context(chunks)
agent.chat(
            message=message,
            conversation_history='',
            context=context
        )

In [100]:
doc = retriever.retrieve('–∫–∞–∫ —Ä–µ—à–∏—Ç—å –∫–≤–∞–¥—Ä–∞—Ç–Ω–æ–µ —É—Ä–∞–≤–Ω–µ–Ω–∏–µ?')

‚úÖ Reranked 30 chunks using CrossEncoder


In [145]:
from ragas import evaluate
from ragas.metrics import (
    Faithfulness,
    AnswerRelevancy,
    ContextPrecision,
    ContextRecall,
)
from datasets import Dataset
from langchain_core.documents import Document
import os

input_eval_data = {}
def run_ragas_evaluation(val_data, retriever, agent, k=10):
    """
    –ì–µ–Ω–µ—Ä–∏—Ä—É–µ—Ç –æ—Ç–≤–µ—Ç—ã –∏ –∫–æ–Ω—Ç–µ–∫—Å—Ç—ã, –∑–∞—Ç–µ–º –æ—Ü–µ–Ω–∏–≤–∞–µ—Ç –∏—Ö —Å –ø–æ–º–æ—â—å—é RAGAS.
    
    Args:
        val_data: —Å–ø–∏—Å–æ–∫ –ø—Ä–∏–º–µ—Ä–æ–≤ –≤–∏–¥–∞ {"query": "...", ...}
        retriever: –≤–∞—à —Ä–µ—Ç—Ä–∏–≤–µ—Ä —Å –º–µ—Ç–æ–¥–æ–º retrieve(query, ..., top_k=k, levels=[level])
        generator_chain: LangChain —Ü–µ–ø–æ—á–∫–∞, –∫–æ—Ç–æ—Ä–∞—è –ø—Ä–∏–Ω–∏–º–∞–µ—Ç {"context": str, "question": str} -> –æ—Ç–≤–µ—Ç (str)
        k: top_k –¥–ª—è —Ä–µ—Ç—Ä–∏–≤–µ—Ä–∞
        level: —É—Ä–æ–≤–µ–Ω—å —Ñ–∏–ª—å—Ç—Ä–∞—Ü–∏–∏
    """
    questions = []
    answers = []
    contexts_list = []  # list of list of str
    references=  []

    for item in tqdm(val_data, desc="Generating RAG responses for RAGAS"):
        # 1. –†–µ—Ç—Ä–∏–≤ —á–∞–Ω–∫–æ–≤
        retrieved = retriever.retrieve(
            query=item["query"],
            initial_retrieval_k=k,
            top_k=k,
        )
        # 2. –ò–∑–≤–ª–µ—á–µ–Ω–∏–µ —Ç–µ–∫—Å—Ç–æ–≤ –∫–æ–Ω—Ç–µ–∫—Å—Ç–∞
        contexts = [r["text"] for r in retrieved]

        context = retriever.format_chunks_for_context(retrieved)

        answer = eval_data.query(f'question == "{item["query"]}"')['rag_response'].values[0]
        # 4. –°–æ—Ö—Ä–∞–Ω–µ–Ω–∏–µ
        questions.append(item["query"])
        answers.append(answer)
        contexts_list.append(contexts)  # RAGAS –æ–∂–∏–¥–∞–µ—Ç List[List[str]]
        references.append(item['reference_answer'])
    
    
    dataset = Dataset.from_dict({
        "question": questions,
        "answer": answers,
        "contexts": contexts_list,
        "reference": references
    })
    
    result = evaluate(
        dataset=dataset,
        metrics=[
            Faithfulness(),
            AnswerRelevancy(),
            ContextPrecision(),
            ContextRecall(),
        ],
        llm=chat,  
        embeddings=HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")  # –¥–ª—è context_precision/recall
    )
    
    return result

In [None]:
result_ragas = run_ragas_evaluation(validation_data_all, retriever, agent, k=5)

In [150]:
pickle.dump(result_ragas, open('ragas_results.pickle', 'wb'))

In [164]:
result_ragas.to_pandas()

Unnamed: 0,user_input,retrieved_contexts,response,reference,faithfulness,answer_relevancy,context_precision,context_recall
0,–ß—Ç–æ —Ç–∞–∫–æ–µ –ø–µ—Ä–≤–æ–æ–±—Ä–∞–∑–Ω–∞—è –∏ –∫–∞–∫ –æ–Ω–∞ —Å–≤—è–∑–∞–Ω–∞ —Å –Ω–µ...,[# –í–æ–ø—Ä–æ—Å—ã –¥–ª—è —Å–∞–º–æ–ø—Ä–æ–≤–µ—Ä–∫–∏\n\n1. –ß—Ç–æ –Ω–∞–∑—ã–≤–∞–µ—Ç...,–ü–µ—Ä–≤–æ–æ–±—Ä–∞–∑–Ω–∞—è —Ñ—É–Ω–∫—Ü–∏–∏ \( y = f(x) \) ‚Äî —ç—Ç–æ —Ç–∞–∫...,–ü–µ—Ä–≤–æ–æ–±—Ä–∞–∑–Ω–æ–π –¥–ª—è —Ñ—É–Ω–∫—Ü–∏–∏ \( f(x) \) –Ω–∞ –ø—Ä–æ–º–µ–∂...,0.888889,0.359121,0.95,1.0
1,–í —á—ë–º —Å–æ—Å—Ç–æ–∏—Ç –º–µ—Ç–æ–¥ –∏–Ω—Ç–µ–≥—Ä–∏—Ä–æ–≤–∞–Ω–∏—è –ø–æ —á–∞—Å—Ç—è–º –∏...,[# ¬ß 2. –ò–Ω—Ç–µ–≥—Ä–∏—Ä–æ–≤–∞–Ω–∏–µ –ø–æ —á–∞—Å—Ç—è–º\n\n1. –ò–Ω—Ç–µ–≥—Ä–∏...,–ú–µ—Ç–æ–¥ –∏–Ω—Ç–µ–≥—Ä–∏—Ä–æ–≤–∞–Ω–∏—è –ø–æ —á–∞—Å—Ç—è–º ‚Äî —ç—Ç–æ –æ–¥–∏–Ω –∏–∑ –º...,–ú–µ—Ç–æ–¥ –∏–Ω—Ç–µ–≥—Ä–∏—Ä–æ–≤–∞–Ω–∏—è –ø–æ —á–∞—Å—Ç—è–º –æ—Å–Ω–æ–≤–∞–Ω –Ω–∞ —Ñ–æ—Ä–º...,0.678571,0.07744,0.416667,1.0
2,–ö–∞–∫ –æ–ø—Ä–µ–¥–µ–ª—è–µ—Ç—Å—è –æ–ø—Ä–µ–¥–µ–ª—ë–Ω–Ω—ã–π –∏–Ω—Ç–µ–≥—Ä–∞–ª —á–µ—Ä–µ–∑ –ø...,[# –í–æ–ø—Ä–æ—Å—ã –¥–ª—è —Å–∞–º–æ–ø—Ä–æ–≤–µ—Ä–∫–∏\n\n1. –ü–æ—á–µ–º—É –æ–ø—Ä–µ–¥...,–û–ø—Ä–µ–¥–µ–ª—ë–Ω–Ω—ã–π –∏–Ω—Ç–µ–≥—Ä–∞–ª –º–æ–∂–Ω–æ —Ä–∞—Å—Å–º–∞—Ç—Ä–∏–≤–∞—Ç—å –∫–∞–∫ ...,–ï—Å–ª–∏ \( F(x) \) ‚Äî –ø–µ—Ä–≤–æ–æ–±—Ä–∞–∑–Ω–∞—è –¥–ª—è \( f(x) \)...,1.0,0.777281,0.679167,1.0
3,–ö–∞–∫–æ–≤–∞ —Ñ–æ—Ä–º—É–ª–∞ –ù—å—é—Ç–æ–Ω–∞‚Äì–õ–µ–π–±–Ω–∏—Ü–∞ –∏ –ø—Ä–∏ –∫–∞–∫–∏—Ö —É—Å...,"[# 3. –§–æ—Ä–º—É–ª–∞ –ù—å—é—Ç–æ–Ω–∞ ‚Äî –õ–µ–π–±–Ω–∏—Ü–∞\n\n–£ –≤–∞—Å, –Ω–∞–≤...",–§–æ—Ä–º—É–ª–∞ –ù—å—é—Ç–æ–Ω–∞-–õ–µ–π–±–Ω–∏—Ü–∞ —Å–≤—è–∑—ã–≤–∞–µ—Ç –æ–ø—Ä–µ–¥–µ–ª–µ–Ω–Ω—ã...,"–§–æ—Ä–º—É–ª–∞ –ù—å—é—Ç–æ–Ω–∞‚Äì–õ–µ–π–±–Ω–∏—Ü–∞ —É—Ç–≤–µ—Ä–∂–¥–∞–µ—Ç, —á—Ç–æ –µ—Å–ª–∏ ...",0.466667,0.158105,1.0,1.0
4,–ö–∞–∫ —Ä–∞–∑–ª–æ–∂–∏—Ç—å –º–Ω–æ–≥–æ—á–ª–µ–Ω –Ω–∞ –º–Ω–æ–∂–∏—Ç–µ–ª–∏ —Å–ø–æ—Å–æ–±–æ–º ...,[# 5.21. –†–ê–ó–õ–û–ñ–ï–ù–ò–ï –ú–ù–û–ì–û–ß–õ–ï–ù–û–í –ù–ê –ú–ù–û–ñ–ò–¢–ï–õ–ò\n...,–ß—Ç–æ–±—ã —Ä–∞–∑–ª–æ–∂–∏—Ç—å –º–Ω–æ–≥–æ—á–ª–µ–Ω –Ω–∞ –º–Ω–æ–∂–∏—Ç–µ–ª–∏ —Å–ø–æ—Å–æ–±–æ...,–°–ø–æ—Å–æ–± –≥—Ä—É–ø–ø–∏—Ä–æ–≤–∫–∏ –∑–∞–∫–ª—é—á–∞–µ—Ç—Å—è –≤ —Å–ª–µ–¥—É—é—â–µ–º:\n\...,0.35,1.0,1.0,1.0
5,–ö–∞–∫ —Ä–∞–∑–ª–æ–∂–∏—Ç—å –º–Ω–æ–≥–æ—á–ª–µ–Ω –Ω–∞ –º–Ω–æ–∂–∏—Ç–µ–ª–∏ —Å –ø–æ–º–æ—â—å—é...,[# 7. –†–ê–ó–õ–û–ñ–ï–ù–ò–ï –ú–ù–û–ì–û–ß–õ–ï–ù–û–í –ù–ê –ú–ù–û–ñ–ò–¢–ï–õ–ò\n\n–†...,–ß—Ç–æ–±—ã —Ä–∞–∑–ª–æ–∂–∏—Ç—å –º–Ω–æ–≥–æ—á–ª–µ–Ω –Ω–∞ –º–Ω–æ–∂–∏—Ç–µ–ª–∏ —Å –ø–æ–º–æ—â...,–†–∞–∑–ª–æ–∂–µ–Ω–∏–µ –º–Ω–æ–≥–æ—á–ª–µ–Ω–∞ –Ω–∞ –º–Ω–æ–∂–∏—Ç–µ–ª–∏ —Å –ø–æ–º–æ—â—å—é —Ñ...,0.9,0.844079,0.638889,1.0
6,–ö–∞–∫–æ–≤—ã —Å–≤–æ–π—Å—Ç–≤–∞ –∫–≤–∞–¥—Ä–∞—Ç–∏—á–Ω–æ–π —Ñ—É–Ω–∫—Ü–∏–∏?,[# –ö–í–ê–î–†–ê–¢–ò–ß–ù–ê–Ø –§–£–ù–ö–¶–ò–Ø. –§–£–ù–ö–¶–ò–Ø —É = k\n\n–°–≤–æ–π...,–ö–≤–∞–¥—Ä–∞—Ç–∏—á–Ω–∞—è —Ñ—É–Ω–∫—Ü–∏—è –∏–º–µ–µ—Ç –Ω–µ—Å–∫–æ–ª—å–∫–æ –≤–∞–∂–Ω—ã—Ö —Å–≤...,–°–≤–æ–π—Å—Ç–≤–∞ –∫–≤–∞–¥—Ä–∞—Ç–∏—á–Ω–æ–π —Ñ—É–Ω–∫—Ü–∏–∏ \( y = kx^2 \) (...,,0.673281,,0.142857
7,–ö–∞–∫ —Ä–µ—à–∏—Ç—å –∫—Ä–∞–¥—Ä–∞—Ç–Ω–æ–µ —É—Ä–∞–≤–Ω–µ–Ω–∏–µ —Å –ø–æ–º–æ—â—å—é —Ç–µ–æ—Ä...,[# 4.24. –ö–í–ê–î–†–ê–¢–ù–´–ï –£–†–ê–í–ù–ï–ù–ò–Ø\n\n–¢–µ–æ—Ä–µ–º–∞ 1 –ü—É—Å...,–ú—ã —Ä–µ—à–∏–ª–∏ –∫–≤–∞–¥—Ä–∞—Ç–Ω–æ–µ —É—Ä–∞–≤–Ω–µ–Ω–∏–µ \( x^2 - 5x + 6...,–ß—Ç–æ–±—ã —Ä–µ—à–∏—Ç—å –∫–≤–∞–¥—Ä–∞—Ç–Ω–æ–µ —É—Ä–∞–≤–Ω–µ–Ω–∏–µ —Å –ø–æ–º–æ—â—å—é —Ç–µ...,,0.549375,,0.545455
8,–ö–∞–∫ —Ä–µ—à–∏—Ç—å –∫—Ä–∞–¥—Ä–∞—Ç–Ω–æ–µ —É—Ä–∞–≤–Ω–µ–Ω–∏–µ —á–µ—Ä–µ–∑ –¥–∏—Å–∫—Ä–∏–º–∏...,"[# 4. –ö–í–ê–î–†–ê–¢–ù–´–ï –£–†–ê–í–ù–ï–ù–ò–Ø\n\nHo t = x + b, —Ç–∞...",–ß—Ç–æ–±—ã —Ä–µ—à–∏—Ç—å –∫–≤–∞–¥—Ä–∞—Ç–Ω–æ–µ —É—Ä–∞–≤–Ω–µ–Ω–∏–µ —Å –ø–æ–º–æ—â—å—é –¥–∏...,–ß—Ç–æ–±—ã —Ä–µ—à–∏—Ç—å –∫–≤–∞–¥—Ä–∞—Ç–Ω–æ–µ —É—Ä–∞–≤–Ω–µ–Ω–∏–µ \( ax^2 + bx...,0.666667,0.86715,1.0,0.666667
9,–ö–∞–∫ —Ä–µ—à–∏—Ç—å —Å–∏—Å—Ç–µ–º—É –ª–∏–Ω–µ–π–Ω—ã—Ö —É—Ä–∞–≤–Ω–µ–Ω–∏—è –º–µ—Ç–æ–¥–æ–º ...,[# 8. –°–ò–°–¢–ï–ú–´ –î–í–£–• –õ–ò–ù–ï–ô–ù–´–• –£–†–ê–í–ù–ï–ù–ò–ô –° –î–í–£–ú–Ø ...,–ú–µ—Ç–æ–¥ –≤–≤–µ–¥–µ–Ω–∏—è –Ω–æ–≤—ã—Ö –ø–µ—Ä–µ–º–µ–Ω–Ω—ã—Ö ‚Äî —ç—Ç–æ –æ–¥–∏–Ω –∏–∑ ...,–ú–µ—Ç–æ–¥ –≤–≤–µ–¥–µ–Ω–∏—è –Ω–æ–≤—ã—Ö –ø–µ—Ä–µ–º–µ–Ω–Ω—ã—Ö –ø—Ä–∏ —Ä–µ—à–µ–Ω–∏–∏ —Å–∏...,0.190476,0.965906,0.45,0.666667


In [149]:
with open('regas_metrics.txt', 'w') as f:
    f.write(str(result_ragas))

In [165]:
result_ragas.to_pandas().to_csv('result_ragas_df.csv')

In [175]:
all_metrics = result_ragas._repr_dict.copy()
truthfulness_val = eval_data['score_truthfulness'].mean()
all_metrics['truthfulness'] = truthfulness_val
all_metrics

{'faithfulness': 0.5374021903349537,
 'answer_relevancy': 0.7629160069914669,
 'context_precision': 0.6817251461747575,
 'context_recall': 0.7877585377585378,
 'truthfulness': 0.8833333333333333}