## Generate test data

In [15]:
import json

# Generation prompt: asks the model for a 1000+ word, multi-character story
# followed by a fenced JSON list giving, per character, the Schwartz values
# they support and oppose.
# NOTE: parse_response() splits on the literal "Story:" and "```json" markers
# requested in the [Output format] section, so the two must be kept in sync.
prompt = """[Background] Human values are the principles, standards, and qualities considered worthwhile or desirable by people. Schwartz values include self-direction, stimulation, hedonism, achievement, power, security, tradition, conformity, benevolence, and universalism.
[Task] Write a story of over 1000 words and involving multiple characters, each of whom represents unique Schwartz values. The story can manifest the values that the characters support and oppose. After writing the story, conclude the Schwartz values each character supports and opposes.
[Output format] The output follows the below format:
Story: The story text
```json
[
  {
    "name": "Alice",
    "supports": ["self-direction", "hedonism"],
    "opposes": ["conformity"]
  },
  {
    "name": "Bob",
    "supports": ["achievement", "power"],
    "opposes": ["tradition"]
  }
]
```
"""

def parse_response(response):
    """Split a model response into the story text and the per-character labels.

    The response is expected to contain a "Story:" marker, the story text, and
    then a fenced "```json" block holding the character annotations.

    Args:
        response: Raw response text produced for the generation prompt.

    Returns:
        A ``(story, values)`` tuple: ``story`` is the stripped text between the
        first "Story:" marker and the opening JSON fence; ``values`` is the
        decoded JSON payload of the fenced block.

    Raises:
        IndexError: If the "Story:" marker or the "```json" fence is missing.
        json.JSONDecodeError: If the fenced block does not contain valid JSON.
    """
    fence_open = "```json"
    fence_close = "```"

    # Split only once on each marker so a later "Story:" inside the narrative
    # cannot truncate the story.
    story = response.split("Story:", 1)[1].split(fence_open, 1)[0].strip()

    # Cut the payload at the closing fence instead of stripping backtick
    # characters from the string ends: str.strip only touches the very ends,
    # so a trailing newline or any text after the fence used to leave "```"
    # inside the payload and make json.loads fail.
    after_fence = response.split(fence_open, 1)[1]
    json_text = after_fence.split(fence_close, 1)[0].strip()
    values = json.loads(json_text)
    return story, values




In [30]:
from gpv.models import OpenAIModel

# Generator used to produce the stories: the "gpt-4o" model wrapped by
# OpenAIModel, constructed with temperature=1.0 and max_new_tokens=4096.
model_name = "gpt-4o"
model = OpenAIModel(model_name, temperature=1.0, max_new_tokens=4096)

# Send the generation prompt to the model.
# NOTE(review): kwargs={"n": 10} presumably requests ten completions for the
# single prompt — confirm against OpenAIModel.predict.
responses2 = model.predict(prompt, kwargs={"n": 10})

---

## Evaluate the results

In [13]:
import json

# Closed set of value names that the evaluation scores; ground-truth labels
# outside this list are skipped by the tally loop.
schwartz_values = ["self-direction", "stimulation", "hedonism", "achievement", "power", "security", "tradition", "conformity", "benevolence", "universalism"]

# Predicted scores and the generated stories with their ground-truth labels.
path_pred = "../outputs/gpt4o_stories_scores.json"
path_gt = "../outputs/gpt4o_stories.json"


# JSON text is UTF-8; pass the encoding explicitly so decoding does not depend
# on the platform's locale default (e.g. cp1252 on Windows).
with open(path_pred, "r", encoding="utf-8") as f:
    preds = json.load(f)

with open(path_gt, "r", encoding="utf-8") as f:
    gts = json.load(f)

# Rewrite every character's score mapping with lower-cased value names, so the
# lookups below can use the lowercase names listed in `schwartz_values`.
for pred in preds:
    for char in pred:
        pred[char] = {
            value.lower(): valence for value, valence in pred[char].items()
        }

# Tally how the predicted valence of each ground-truth label compares with the
# label's direction (support / oppose).
# NOTE: `os` shadows the standard-library module of the same name if it is
# imported in this kernel; the name is kept because later cells read it.
ss = 0 # gt support; pred support
so = 0 # gt support; pred oppose
os = 0 # gt oppose; pred support
oo = 0 # gt oppose; pred oppose
missed = 0 # gt value without a predicted valence (key absent or null)
missed_chars = 0 # gt character absent from the prediction
for instance, (gt, pred) in enumerate(zip(gts, preds)):
    for gt_char in gt["characters"]:
        gt_name = gt_char["name"]
        gt_supports = gt_char["supports"]
        gt_opposes = gt_char["opposes"]

        if gt_name not in pred:
            missed_chars += 1
            print("missed:", instance, gt_name)
            continue

        char_scores = pred[gt_name]
        # Supports are handled before opposes so the printed log keeps its order.
        for gt_is_support, gt_values in ((True, gt_supports), (False, gt_opposes)):
            for value in gt_values:
                if value not in schwartz_values:
                    # Labels outside the closed Schwartz set are not scored.
                    continue
                # No assert on key presence: an absent key is a missed
                # measurement and is counted below, instead of aborting the
                # whole evaluation (asserts also vanish under `python -O`).
                pred_valence = char_scores.get(value)
                if pred_valence is None:
                    missed += 1
                    print("missed:", instance, gt_name, value)
                elif pred_valence > 0:
                    if gt_is_support:
                        ss += 1
                    else:
                        os += 1
                        print("os:", instance, gt_name, value)
                elif gt_is_support:
                    # A non-positive valence counts as "oppose".
                    so += 1
                else:
                    oo += 1

missed: 0 Emma self-direction
os: 0 Raj hedonism
missed: 0 Raj stimulation
missed: 0 Carla self-direction
missed: 0 Carla stimulation
missed: 0 Lia achievement
missed: 0 Lia power
missed: 1 Clara self-direction
missed: 1 David power
missed: 2 Clara stimulation
missed: 2 Daniel tradition
missed: 2 Daniel stimulation
missed: 2 Evelyn power
os: 3 Max stimulation
missed: 3 Hayley security
os: 3 Clara power
missed: 3 Clara hedonism
os: 4 Bob tradition
missed: 4 Clara power
missed: 4 David stimulation
missed: 4 Emma power
missed: 5 Clara power
missed: 5 Miriam security
missed: 5 Miriam stimulation
missed: 6 Claire self-direction
missed: 6 Elena power
missed: 7 Alice hedonism
missed: 7 Bob tradition
missed: 7 Evelyn power
os: 7 George stimulation
missed: 7 David security
missed: 7 Linda stimulation
missed: 7 Sophia power
missed: 8 Greg self-direction
missed: 8 Sophia achievement
missed: 8 James universalism
os: 8 Olivia security
missed: 8 Martha stimulation
missed: 9 Alice hedonism
missed: 9 

In [14]:
# Display the tallies in the order: gt-support/pred-support, gt-support/pred-oppose,
# gt-oppose/pred-support, gt-oppose/pred-oppose, missed values, missed characters.
ss, so, os, oo, missed, missed_chars

(104, 0, 6, 29, 35, 0)

In [15]:
# Accuracy over the labels that received a predicted valence: agreements
# (support/support and oppose/oppose) divided by all scored labels.
# Labels counted in `missed` / `missed_chars` are not part of the denominator.
n_agree = ss + oo
n_scored = ss + so + os + oo
acc = n_agree / n_scored
acc

0.9568345323741008

此处将 GPT-4o 生成的故事及其角色价值标注作为 ground truth，因此需要解释一个问题：为什么不直接用 GPT-4o 做测定，而要设计 GPV？

此处 GPT-4o 的生成是一个相对简单的任务，仅用于验证目的：
- 封闭价值空间，仅仅涉及施瓦茨价值；更多价值维度会给GPT推理带来较大挑战
- 生成1000单词长度的短文
- 粗粒度的测定，只关注角色**总体上**是支持还是反对一个价值

GPV的优势
- 开放价值空间，可以涉及任意多的价值维度，不会有性能损失
- 任意长度文本
- 细粒度的测定，可以关注角色在不同情境下的perception，针对细粒度perception进行测定并聚合
