In [8]:
import json
import pandas as pd
from deepeval.test_case import LLMTestCase
from deepeval.test_case import LLMTestCaseParams
from deepeval.metrics import GEval
from deepeval import assert_test

# Disabled: load previously saved model outputs (each element is a JSON string).
# NOTE: the paths below are machine-specific absolute paths; point them at a
# configurable results directory before re-enabling these loaders.
# with open("/Users/zhangran/Desktop/BP@UnitedStates/Code/D2D_Data2Dashboard/exp_result/exp01_d2insight_sys_stimhartnow_result.json", "r") as f:
#     output_sys = json.load(f)

# with open("/Users/zhangran/Desktop/BP@UnitedStates/Code/D2D_Data2Dashboard/exp_result/exp01_d2insight_gpt4o_domain_stimhartnow_result.json", "r") as f:
#     output_gpt4o = json.load(f)

# Insight text used as the `actual_output` of `test_case_sys`.
# NOTE(review): presumably produced by the system under evaluation — confirm
# provenance; the text is pasted inline rather than loaded from a results file.
insight_sys = '''1. **Investment Preferences by Gender**: We want to explore if there are differences in investment preferences between genders. A stacked bar chart can effectively show the distribution of investment avenues (Mutual Funds, Equity Market, etc.) by gender. This will help us understand if certain investment types are more popular among males or females.
2. **Age Distribution and Investment Choices**: The age range is relatively narrow, but it's important to see how age correlates with investment choices. A box plot can show the distribution of ages for different investment avenues, highlighting any age-related trends in investment behavior.
3. **Investment Monitoring Frequency**: Understanding how frequently individuals monitor their investments can provide insights into their engagement level. A pie chart showing the distribution of investment monitoring frequencies (Daily, Weekly, Monthly) will illustrate this aspect.
4. **Savings Objectives and Investment Avenues**: It's crucial to see how savings objectives align with chosen investment avenues. A heatmap can display the relationship between savings objectives and investment avenues, showing which objectives are associated with which types of investments.
5. **Expected Returns and Investment Preferences**: Analyzing the expected returns against investment preferences can reveal risk tolerance levels. A scatter plot with jitter can show the relationship between expected returns and the number of investments in different avenues.'''

# Insight text used as the `actual_output` of `test_case_human`.
# NOTE(review): presumably human-written commentary on the same dataset (it
# refers to "the above graph", which is not part of this notebook) — confirm
# provenance.
insight_human = '''We can see from the above graph that most of our respondents were in the ages 24-31 and we will look further their habits in the finance field. The above trend shows us that people are less worried about risks involved and more worried about the money which they will get back. We can see that high number of males are dependent on Returns. From the above plot it is clear that as people grow older they are more inclined towards Returns and people at young age look more for wealth creation'''

# 3. Create test cases without a reference (expected) output.
#
# Both test cases must be judged against exactly the same input text, so the
# dataset description lives in a single constant instead of being pasted twice
# (two copies can silently drift apart and skew the comparison).
# The column names inside the string are separated by literal tab characters.
DATASET_DESCRIPTION = "Personal finance investment survey dataset with columns: gender	age	Investment_Avenues	Mutual_Funds	Equity_Market	Debentures	Government_Bonds	Fixed_Deposits	PPF	Gold	...	Duration	Invest_Monitor	Expect	Avenue	What are your savings objectives?	Reason_Equity	Reason_Mutual	Reason_Bonds	Reason_FD	Source"

# Test case whose candidate text is `insight_sys`.
test_case_sys = LLMTestCase(
    input=DATASET_DESCRIPTION,
    actual_output=insight_sys,
)

# Test case whose candidate text is `insight_human`.
test_case_human = LLMTestCase(
    input=DATASET_DESCRIPTION,
    actual_output=insight_human,
)

In [9]:
# 4. GEval metrics (self-evaluation: only the test case's input and actual
#    output are handed to the judge, so no expected_output is required).
def _input_output_geval(metric_name, metric_criteria):
    """Build a GEval metric that is evaluated on INPUT and ACTUAL_OUTPUT only.

    Args:
        metric_name: Value passed to GEval as ``name``.
        metric_criteria: Natural-language criteria passed to GEval as ``criteria``.

    Returns:
        The constructed ``GEval`` instance.
    """
    return GEval(
        name=metric_name,
        criteria=metric_criteria,
        # A fresh list per metric, exactly as when each metric is written out by hand.
        evaluation_params=[
            LLMTestCaseParams.INPUT,
            LLMTestCaseParams.ACTUAL_OUTPUT,
        ],
    )


insightful = _input_output_geval(
    "Insightful",
    "Does the output offer a deep or non-obvious understanding? "
    "Does it connect patterns or trends that aren't immediately apparent?",
)

novelty = _input_output_geval(
    "Novelty",
    "Does the output go beyond generic interpretation? "
    "Would it surprise or teach something new to a domain expert?",
)

depth = _input_output_geval(
    "Depth",
    "Does the analysis demonstrate deep insight",
)

In [10]:
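# # 5. Run evaluation (assertion-based print) — currently disabled.
# # NOTE: this call previously listed `domain_relevance`, which is not defined in
# # this notebook; the metrics defined here are `insightful`, `novelty`, `depth`.
# print("\n=== Insight Evaluation Report ===")
# assert_test(test_case_sys, [insightful, novelty, depth])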

In [11]:
from deepeval import evaluate


def print_evaluation_report(evaluation_result, max_output_chars=300):
    """Pretty-print every test result held by a deepeval evaluation result.

    The test results are read from the ``test_results`` attribute directly.
    Iterating the result object itself yields ``(field_name, value)`` pairs for
    *every* field, which breaks as soon as a field other than ``test_results``
    holds a non-``None`` value (e.g. a string would be walked character by
    character and fail on ``.input``).

    Args:
        evaluation_result: Object returned by ``evaluate``; must expose a
            ``test_results`` attribute.
        max_output_chars: How many leading characters of each test result's
            ``actual_output`` to print.

    Returns:
        None. The report is written to stdout.
    """
    print("\n==== Label: test_results ====")

    test_results = evaluation_result.test_results
    if not test_results:
        print("⚠️  No results for this label.\n")
        return

    for test_result in test_results:
        print(f"Input:  {test_result.input}")
        # Fall back to "" so that slicing cannot fail on a missing output.
        output_preview = (test_result.actual_output or "")[:max_output_chars]
        print(f"Output: {output_preview}...")

        for metric in test_result.metrics_data or []:
            # NOTE(review): score appears to be optional (the metric summary
            # carries a separate `error` field); formatting None with ".2f"
            # would raise TypeError, so show a placeholder instead.
            score_text = "n/a" if metric.score is None else f"{metric.score:.2f}"
            print(f"{metric.name:<25}: {score_text}  |  {metric.reason}")


# Score `test_case_sys` with the three GEval metrics.
results_sys = evaluate(
    test_cases=[test_case_sys],
    metrics=[insightful, novelty, depth],
)

print_evaluation_report(results_sys)





Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:06,  6.86s/test case]



Metrics Summary

  - ✅ Insightful (GEval) (score: 0.8179717012599503, threshold: 0.5, strict: False, evaluation model: gpt-4o, reason: The output identifies connections such as gender differences in investment preferences and age-related trends, which are not explicitly stated in the input. It also unveils patterns like the relationship between savings objectives and investment avenues, and expected returns with investment preferences, going beyond the immediate information provided., error: None)
  - ❌ Novelty (GEval) (score: 0.3964374642639535, threshold: 0.5, strict: False, evaluation model: gpt-4o, reason: The output provides basic insights like gender differences and age distribution in investment preferences, but lacks specific examples or novel conclusions. It suggests common visualization techniques without introducing innovative approaches or perspectives that require additional thinking from a domain expert., error: None)
  - ✅ Depth (GEval) (score: 0.733854962590572, thres





==== Label: test_results ====
Input:  Personal finance investment survey dataset with columns: gender	age	Investment_Avenues	Mutual_Funds	Equity_Market	Debentures	Government_Bonds	Fixed_Deposits	PPF	Gold	...	Duration	Invest_Monitor	Expect	Avenue	What are your savings objectives?	Reason_Equity	Reason_Mutual	Reason_Bonds	Reason_FD	Source
Output: 1. **Investment Preferences by Gender**: We want to explore if there are differences in investment preferences between genders. A stacked bar chart can effectively show the distribution of investment avenues (Mutual Funds, Equity Market, etc.) by gender. This will help us understand if certain inves...
Insightful (GEval)       : 0.82  |  The output identifies connections such as gender differences in investment preferences and age-related trends, which are not explicitly stated in the input. It also unveils patterns like the relationship between savings objectives and investment avenues, and expected returns with investment preferences, going be

In [12]:
# Score `test_case_human` with the same three GEval metrics.
results_human = evaluate(
    test_cases=[test_case_human],
    metrics=[insightful, novelty, depth],
)

# ── pretty-print ───────────────────────────────
# Read the test results from the `test_results` attribute directly. Iterating
# the result object itself yields (field_name, value) pairs for every field and
# crashes as soon as a field other than `test_results` holds a non-None value
# (e.g. a string would be walked character by character).
print("\n==== Label: test_results ====")

human_test_results = results_human.test_results
if not human_test_results:
    print("⚠️  No results for this label.\n")

for test_result in human_test_results or []:
    print(f"Input:  {test_result.input}")
    # Fall back to "" so that slicing cannot fail on a missing output.
    output_preview = (test_result.actual_output or "")[:300]
    print(f"Output: {output_preview}...")

    for metric in test_result.metrics_data or []:
        # NOTE(review): score appears to be optional (the metric summary carries
        # a separate `error` field); formatting None with ".2f" would raise
        # TypeError, so show a placeholder instead.
        score_text = "n/a" if metric.score is None else f"{metric.score:.2f}"
        print(f"{metric.name:<25}: {score_text}  |  {metric.reason}")

Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:05,  5.11s/test case]



Metrics Summary

  - ❌ Insightful (GEval) (score: 0.43478745360279164, threshold: 0.5, strict: False, evaluation model: gpt-4o, reason: The output identifies a trend of age-related investment preferences but lacks depth in connecting other dataset elements or providing fresh insights beyond age and gender., error: None)
  - ❌ Novelty (GEval) (score: 0.33098848833644495, threshold: 0.5, strict: False, evaluation model: gpt-4o, reason: The output provides some insights on age and gender trends in investment preferences, but lacks unique perspectives or surprising insights for a domain expert., error: None)
  - ❌ Depth (GEval) (score: 0.3249491955466016, threshold: 0.5, strict: False, evaluation model: gpt-4o, reason: The output mentions age-related investment preferences and gender dependency on returns, but lacks unique perspectives or complex synthesis. It does not identify non-obvious patterns or challenge assumptions innovatively., error: None)

For test case:

  - input: Personal 





==== Label: test_results ====
Input:  Personal finance investment survey dataset with columns: gender	age	Investment_Avenues	Mutual_Funds	Equity_Market	Debentures	Government_Bonds	Fixed_Deposits	PPF	Gold	...	Duration	Invest_Monitor	Expect	Avenue	What are your savings objectives?	Reason_Equity	Reason_Mutual	Reason_Bonds	Reason_FD	Source
Output: We can see from the above graph that most of our respondents were in the ages 24-31 and we will look further their habits in the finance field. The above trend shows us that people are less worried about risks involved and more worried about the money which they will get back. We can see that high n...
Insightful (GEval)       : 0.43  |  The output identifies a trend of age-related investment preferences but lacks depth in connecting other dataset elements or providing fresh insights beyond age and gender.
Novelty (GEval)          : 0.33  |  The output provides some insights on age and gender trends in investment preferences, but lacks unique pe