In [13]:
import pandas as pd

import vertexai
from vertexai.evaluation import EvalTask, PointwiseMetric, PointwiseMetricPromptTemplate
from google.cloud import aiplatform
import os

In [14]:
# Google Cloud project and region used to initialise the Vertex AI SDK below.
PROJECT_ID = "poc-mulia-ceramics-ai"
LOCATION = "asia-southeast1"
# Experiment name; not consumed by `vertexai.init` here, only defined for later cells.
EXPERIMENT_NAME = "sandbox-experiment"

# Point the SDK at the project/region above for every subsequent Vertex AI call.
vertexai.init(project=PROJECT_ID, location=LOCATION)

In [15]:
# Relevance: a 1-5 rubric for how directly the response addresses the user's query.
# The prompt template declares `chat_history` as its input variable.
relevance_prompt_template = PointwiseMetricPromptTemplate(
    input_variables=["chat_history"],
    criteria={
        "relevance": (
            "How well does the response address the user's query? "
            "Responses should stay on-topic, directly answer the question or request, "
            "and provide meaningful information relevant to the user's input."
        ),
    },
    # Rubric is listed from best (5) to worst (1).
    rating_rubric={
        "5": "Fully on-topic, answers the query directly and thoroughly.",
        "4": "Mostly on-topic, with only minor tangents or gaps.",
        "3": "Partially relevant, some off-topic content or lacking specificity.",
        "2": "Mostly irrelevant or misses the point of the query.",
        "1": "Completely off-topic and unrelated to the user's query.",
    },
)

relevance_metric = PointwiseMetric(
    metric="relevance_metric",
    metric_prompt_template=relevance_prompt_template,
)

In [16]:
# Output completeness: a 1-5 rubric for whether the response carries every
# essential detail (product name, price, link, ...) the user expects.
# The prompt template declares `chat_history` as its input variable.
output_completeness_prompt_template = PointwiseMetricPromptTemplate(
    input_variables=["chat_history"],
    criteria={
        "output_completeness": (
            "Does the response include all essential information required by the user? "
            "This includes details like product name, price, link, or other key content expected by the user."
        ),
    },
    # Rubric is listed from best (5) to worst (1).
    rating_rubric={
        "5": "All essential details are included with no omissions.",
        "4": "Most essential information is present, with minor omissions.",
        "3": "Some key information is missing.",
        "2": "Several important details are missing.",
        "1": "Essential information is largely or entirely missing.",
    },
)

output_completeness_metric = PointwiseMetric(
    metric="output_completeness_metric",
    metric_prompt_template=output_completeness_prompt_template,
)

In [17]:
# Clarity and tone: a 1-5 rubric for how readable and professionally worded
# the response is.
# The prompt template declares `chat_history` as its input variable.
clarity_and_tone_prompt_template = PointwiseMetricPromptTemplate(
    input_variables=["chat_history"],
    criteria={
        "clarity_and_tone": (
            "Is the response clear, professional, and easy to understand? "
            "The tone should be appropriate to the context — polite, helpful, and not overly casual or filled with jargon."
        ),
    },
    # Rubric is listed from best (5) to worst (1).
    rating_rubric={
        "5": "Very clear and professional tone; highly readable.",
        "4": "Generally clear and appropriate, with minor room for improvement.",
        "3": "Some parts are unclear or tone slightly inconsistent.",
        "2": "Often unclear, with tone that feels inappropriate or inconsistent.",
        "1": "Hard to understand or unprofessional in tone.",
    },
)

clarity_and_tone_metric = PointwiseMetric(
    metric="clarity_and_tone_metric",
    metric_prompt_template=clarity_and_tone_prompt_template,
)

In [18]:
# Request fulfillment: a 1-5 rubric for how completely the response satisfies
# the user's specific intent.
# The prompt template declares `chat_history` as its input variable.
request_fulfillment_prompt_template = PointwiseMetricPromptTemplate(
    input_variables=["chat_history"],
    criteria={
        "request_fulfillment": (
            "To what extent does the response fulfill the user's specific intent? "
            "Responses should show a good understanding of the user's goal and deliver helpful, tailored answers."
        ),
    },
    # Rubric is listed from best (5) to worst (1).
    rating_rubric={
        "5": "Fully satisfies the user's request and intent.",
        "4": "Satisfies most aspects of the user's request.",
        "3": "Somewhat fulfills the request but lacks depth or precision.",
        "2": "Fails to fully understand or address the user's intent.",
        "1": "Does not fulfill the request at all.",
    },
)

request_fulfillment_metric = PointwiseMetric(
    metric="request_fulfillment_metric",
    metric_prompt_template=request_fulfillment_prompt_template,
)

In [19]:
# Structure and flow: a 1-5 rubric for how logically organised and easy to
# follow the response is.
# The prompt template declares `chat_history` as its input variable.
structure_and_flow_prompt_template = PointwiseMetricPromptTemplate(
    input_variables=["chat_history"],
    criteria={
        "structure_and_flow": (
            "Is the response well-structured and logically organized? "
            "Information should be presented in a clear sequence that's easy to read and understand."
        ),
    },
    # Rubric is listed from best (5) to worst (1).
    rating_rubric={
        "5": "Excellent organization, very easy to follow.",
        "4": "Generally well-organized with minor improvements needed.",
        "3": "Moderate structure, some disorganization.",
        "2": "Poor structure, difficult to follow.",
        "1": "Disorganized and confusing to read.",
    },
)

structure_and_flow_metric = PointwiseMetric(
    metric="structure_and_flow_metric",
    metric_prompt_template=structure_and_flow_prompt_template,
)

In [31]:
import os
import pandas as pd

folder_path = "chat_history"


def _rows_from_transcript(transcript):
    """Turn one chat transcript into evaluation rows.

    Args:
        transcript: DataFrame with at least `role` and `message` columns, one
            row per chat turn. An optional `timestamp` column is used to
            order the turns.

    Returns:
        A list of dicts, one per assistant turn, each with:
          - "chat_history": every earlier turn rendered as "User: ..." or
            "Agent: ..." and joined with newlines (empty string when the
            assistant speaks first);
          - "response": the assistant message of that turn.
    """
    # A turn without a role or a message can neither be attributed nor rated.
    # Dropping it also avoids calling `.strip()` on a NaN cell, which raises
    # AttributeError.
    transcript = transcript.dropna(subset=["role", "message"])

    if "timestamp" in transcript.columns:
        # Stable sort: turns sharing a timestamp keep their file order instead
        # of being shuffled by the default (non-stable) algorithm.
        # NOTE(review): values read from CSV are compared as-is (strings sort
        # lexicographically) — confirm the timestamp format orders correctly.
        transcript = transcript.sort_values(by="timestamp", kind="stable")

    rows = []
    turns = []
    for raw_role, message in zip(transcript["role"], transcript["message"]):
        # `str()` guards against a role column that pandas parsed as non-text.
        role = str(raw_role).strip().lower()

        if role == "assistant":
            # Snapshot the conversation *before* this reply is appended, so the
            # history never contains the response being evaluated.
            rows.append({
                "chat_history": "\n".join(turns),
                "response": message,
            })

        # Any role other than "user" is rendered as "Agent" in the history.
        speaker = "User" if role == "user" else "Agent"
        turns.append(f"{speaker}: {message}")

    return rows


all_rows = []

# `os.listdir` returns entries in arbitrary order; sorting keeps the dataset
# row order identical across runs and machines.
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith(".csv"):
        continue

    transcript = pd.read_csv(os.path.join(folder_path, filename))

    # Skip CSV files that are not chat transcripts.
    if "role" not in transcript.columns or "message" not in transcript.columns:
        continue

    all_rows.extend(_rows_from_transcript(transcript))

# Explicit columns keep the expected schema even when no rows were collected.
eval_dataset = pd.DataFrame(all_rows, columns=["chat_history", "response"])
eval_dataset.head()


Unnamed: 0,chat_history,response
0,User: Do you have comic about silver surfer?,"Hey there, fellow Marvel fan! I've got some a..."
1,User: Do you have comic about silver surfer?\n...,Great choice! Silver Surfer is a classic! Ba...
2,User: Do you have comic about silver surfer?\n...,Yes! I found a Silver Surfer comic that fits y...
3,User: What is the story of Iron Man (1968) #27...,"I'm sorry, but I don't have information about ..."
4,User: What is the story of Iron Man (1968) #27...,"Based on the provided data, I only have inform..."


In [32]:
# Bundle the dataset with the five pointwise metrics under one experiment.
eval_task = EvalTask(
    experiment=EXPERIMENT_NAME,
    metrics=[
        relevance_metric,
        output_completeness_metric,
        clarity_and_tone_metric,
        request_fulfillment_metric,
        structure_and_flow_metric,
    ],
    dataset=eval_dataset,
)

# `evaluate()` is called with no arguments; the result is kept for later inspection.
pointwise_result = eval_task.evaluate()


Associating projects/1021241277325/locations/asia-southeast1/metadataStores/default/contexts/sandbox-experiment-d439983c-e2c9-4969-b630-69ea51d3a99b to Experiment: sandbox-experiment


Computing metrics with a total of 80 Vertex Gen AI Evaluation Service API requests.


100%|██████████| 80/80 [00:09<00:00,  8.22it/s]

All 80 metric requests are successfully computed.
Evaluation Took:9.749766286000067 seconds



