In [13]:
import sys
from typing import Dict
import json
sys.path.append("..")

from evaluator.base_evaluator import RAGEvaluator
from utils.llm import OpenAIClientLLM
from evaluator.prompt_manager import EvaluationType, EvalPromptManager

class LLMRelevanceEvaluator(RAGEvaluator):
    def pre_process(
        self,
        question: str,
        context: str,
        answer: str
    ) -> str:
        return self.prompt_manager.build_prompt(
            question=question,
            context=context,
            answer=answer,
            eval_type=EvaluationType.RELEVANCE  # or make this configurable
        )
    def call_llm(self, processed_data: str) -> str:
        # Execute LLM call with constructed prompt
        return self.llm.generate(processed_data)
    
    def post_process(self, llm_response: str) -> Dict[str, float]:
        """Parse JSON response into scores dictionary"""
        try:
            # Clean response and parse JSON
            response_text = llm_response.strip().replace('```json', '').replace('```', '')
            result = json.loads(response_text)
            
            # Normalize scores and flatten structure
            scores = {
                'score': result.get('score', 
                           result.get('relevance_score', 
                           result.get('coherence_score', 
                           result.get('accuracy_score', 0.0)))),
                'confidence': result.get('confidence', 0.0)
            }
            
            # Add additional metrics
            for key in result:
                if key.endswith('_score') and key != 'score':
                    scores[key] = result[key]
            
            return scores
            
        except (json.JSONDecodeError, KeyError) as e:
            logger.info(f"Error parsing LLM response: {e}")
            return {
                'score': 0.0,
                'confidence': 0.0,
                'error': str(e)
            }
    

In [14]:
from datasets import load_dataset
delucionqa = load_dataset("rungalileo/ragbench", "delucionqa")
df = delucionqa['train'].to_pandas()
a = df.head()
a['flatten_doc'] = a.apply(lambda x: "\n".join([f"`{label}` {sentence}" for label, sentence in [inner_list for middle_list in x['documents_sentences'] for inner_list in middle_list]]), axis = 1)
answer = a.iloc[1]['response']
documents = a.iloc[1]['flatten_doc']
question = a.iloc[1]['question']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  a['flatten_doc'] = a.apply(lambda x: "\n".join([f"`{label}` {sentence}" for label, sentence in [inner_list for middle_list in x['documents_sentences'] for inner_list in middle_list]]), axis = 1)


In [10]:
logger.info(answer, "\n\n")
logger.info(documents,  "\n\n")
logger.info(question, "\n\n")

To customize the Uconnect system based on your own preferences, you can follow these steps:

1. Press the apps button on the touchscreen to open the app screen.
2. Press and hold the selected app, then drag it to replace an existing shortcut in the main menu bar.
3. Note that this feature is only available when the vehicle is in PARK. 


`0a`  Uconnect 4 with 7-inch display and uconnect 4/4c/4c nav with 8.4-inch display press the apps button, then press the settings button on the touchscreen to display the menu setting screen.
`0b` In this mode the Uconnect system allows you to access programmable features.
`0c` When making a selection, only press one button at a time to enter the desired menu.
`0d` Once in the desired menu, press and release the preferred setting option until a check mark appears next to the setting, showing that setting has been selected.
`0e` Once the setting is complete, press the X button on the touchscreen to close out of the settings screen.
`0f` Pressing the Up

In [7]:
import dotenv
dotenv.load_dotenv()

evaluator = LLMRelevanceEvaluator(
    llm=OpenAIClientLLM(),
    prompt_manager=EvalPromptManager(default_type=EvaluationType.FACTUAL_ACCURACY)
)


result = evaluator.evaluate(
    question=question,
    context=documents,
    answer=answer,
)

In [8]:
result

{'score': 0.9, 'confidence': 0.95, 'relevance_score': 0.9}

In [None]:
from typing import List


class LLMEquivalenceEvaluator(RAGEvaluator):
    def pre_process(
        self,
        question: str|List[str],
        context: str|List[str],
        answer: str|List[str]
    ) -> str:
        assert len(answer) == 2
        two_line_answer = f"    1. {answer[0]}\n    2. {answer[1]}"
        return self.prompt_manager.build_prompt(
            question=question,
            context=context,
            answer=two_line_answer,
            eval_type=EvaluationType.ANSWER_EQUIVALENCE
        )
        
    def call_llm(self, processed_data: str) -> str:
        # Execute LLM call with constructed prompt
        return self.llm.generate(processed_data)
    
    def post_process(self, llm_response: str) -> Dict[str, float]:
        """Parse JSON response into scores dictionary"""
        try:
            # Clean response and parse JSON
            response_text = llm_response.strip().replace('```json', '').replace('```', '')
            result = json.loads(response_text)
            
            scores = {
                "Q1": 1 if result['Q1'] == 'yes' else 0,
                "Q2": 1 if result['Q2'] == 'yes' else 0,
                "Q3": 1 if result['Q3'] == 'yes' else 0,
                "Q4": 1 if result['Q4'] == 'yes' else 0,
            }
            
            return scores
            
        except (json.JSONDecodeError, KeyError) as e:
            logger.info(f"Error parsing LLM response: {response_text}")
            return {
                "Q1": 0, "Q2": 0, "Q3": 0, "Q4": 0,
                'error': str(e)
            }
        
    