In [None]:
from app.run_evaluation import run_manual_review, run_llm_evaluation

In [None]:
from phoenix import Session, StorageConfig, S3LogConfig
from phoenix.evaluation import EvaluationFramework, ModelBasedEvaluator, ManualEvaluator, CustomMetric
from phoenix.store import DocumentStore
from dataclasses import dataclass
from typing import Dict, List
import os
import boto3
import json
import uuid
from botocore.exceptions import ClientError

@dataclass
class Review:
    """A single scored review of one model response.

    The field order below is also the positional order of the generated
    constructor. The first seven fields are required; the last three have
    defaults and may be omitted.

    NOTE(review): the per-group descriptions are inferred from the field
    names only -- confirm against the code that creates ``Review`` objects.
    """

    # -- identifiers (all required) ------------------------------------
    response_id: str
    project_id: str
    reviewer_id: str

    # -- reviewed text and its proposed correction (required) ----------
    model_response: str
    correction: str

    # -- scoring: a named score kind, its integer value, optional note --
    score_type: str
    score: int
    explanation: str = ""

    # -- origin of the review; defaults are "manual" / "human" ---------
    review_type: str = "manual"
    review_model: str = "human"

def load_config(params_path: str = 'parameters.json') -> None:
    """Export database, S3 and region settings into ``os.environ``.

    The parameter file is a JSON array of objects with ``ParameterKey`` and
    ``ParameterValue`` entries. It is read once; the CloudFormation stack
    named by it is then described and selected outputs are exported.

    Environment variables written (each only when its source is present):

    * ``DB_USER`` / ``DB_PASSWORD`` -- ``DBUsername`` / ``DBPassword`` parameters.
    * ``RDS_HOST`` / ``S3_BUCKET`` -- ``DBEndpoint`` / ``BucketName`` stack outputs.
    * ``AWS_REGION`` -- region name of a fresh boto3 session.

    Args:
        params_path: Path of the parameter file. Defaults to
            ``parameters.json`` in the current working directory.

    Raises:
        ValueError: If the file contains no non-empty ``S3BucketName``
            parameter, so no stack name is available.
        OSError, json.JSONDecodeError: If the file cannot be read or parsed.
        KeyError: If an entry lacks ``ParameterKey`` (or a matched entry
            lacks ``ParameterValue``).

    Errors raised by boto3 while describing the stack propagate unchanged.
    """
    # Parse the parameter file a single time and pick out everything needed.
    with open(params_path) as f:
        params = json.load(f)

    stack_name = None
    for param in params:
        key = param['ParameterKey']
        if key == 'DBUsername':
            os.environ['DB_USER'] = param['ParameterValue']
        elif key == 'DBPassword':
            os.environ['DB_PASSWORD'] = param['ParameterValue']
        elif key == 'S3BucketName' and stack_name is None:
            # First occurrence wins.
            # NOTE(review): the stack name is taken from the 'S3BucketName'
            # parameter -- confirm the stack really is named after the bucket.
            stack_name = param['ParameterValue']

    # Fail early with a readable message instead of handing None to boto3.
    if not stack_name:
        raise ValueError(
            f"{params_path!r} has no 'S3BucketName' parameter; "
            "it is required as the CloudFormation stack name"
        )

    # Get DB endpoint and bucket name from CloudFormation.
    cfn = boto3.client('cloudformation')
    response = cfn.describe_stacks(StackName=stack_name)
    # Tolerate a stack description without an 'Outputs' key.
    outputs = response['Stacks'][0].get('Outputs', [])

    for output in outputs:
        if output['OutputKey'] == 'DBEndpoint':
            os.environ['RDS_HOST'] = output['Value']
        if output['OutputKey'] == 'BucketName':
            os.environ['S3_BUCKET'] = output['Value']

    # Set region from a boto3 session.
    session = boto3.session.Session()
    os.environ['AWS_REGION'] = session.region_name

In [None]:
# Both evaluation paths below are run against the same project and the same
# model response, so those two values are named once and reused.
PROJECT_ID = "project_123"
MODEL_RESPONSE = "Response to evaluate"

# Manual review: submit one analyst's score, correction and explanation
# for the response. The return value is kept in `result`.
result = run_manual_review(
    project_id=PROJECT_ID,
    reviewer_id="analyst_1",
    model_response=MODEL_RESPONSE,
    correction="Corrected response",
    score_type="accuracy",
    score=4,
    explanation="Minor inaccuracies",
)

# LLM evaluation: pass the same response together with the list of model
# names to evaluate with. The return value is kept in `llm_result`.
llm_result = run_llm_evaluation(
    project_id=PROJECT_ID,
    model_response=MODEL_RESPONSE,
    models=["gpt-4", "claude-3"],
)