In [1]:
from IPython.display import display, HTML
# widen the element matched by `.container` to 95% of the available width
display(HTML("<style>.container { width:95% !important; }</style>"))

def set_css_in_cell_output(info=None):
    """
    Emit a <style> block that sets the text colour of ipywidgets elements
    (`.jupyter-widgets`, `.widget-label`) to #d5d5d5.

    Registered below as a `pre_run_cell` callback, so it runs before every cell.

    Parameters
    ----------
    info : optional
        Execution-info object that IPython passes to `pre_run_cell` callbacks.
        It is ignored; the default keeps plain zero-argument calls working.
    """
    display(HTML('''
        <style>
            .jupyter-widgets {color: #d5d5d5 !important;}
            .widget-label {color: #d5d5d5 !important;}
        </style>
    '''))

# NOTE: re-running this cell defines a new function object and registers it again.
get_ipython().events.register('pre_run_cell', set_css_in_cell_output)
from IPython.core import ultratb
# NOTE(review): `_tb_highlight` is a private attribute of VerboseTB; confirm it
# still exists when upgrading IPython.
ultratb.VerboseTB._tb_highlight = "bg:#0D0D0D"

In [2]:
import pickle
import time
import tiktoken
import datetime
import json
import requests
import traceback
import re
import numpy as np
from tqdm.notebook import tqdm
from jsonschema import validate
from openai import OpenAI, RateLimitError, APITimeoutError, InternalServerError, Timeout
from tenacity import retry, stop_after_attempt, wait_incrementing, retry_if_exception_type, after_log, before_sleep_log
import logging
import mysql.connector
from mysql.connector import Error
from IPython.display import clear_output

In [None]:
import os

# Private OpenRouter API key. Read from the environment so the secret never has
# to be typed into (or saved with) the notebook; falls back to '' when unset.
OPENROUTER_API_KEY = os.environ.get('OPENROUTER_API_KEY', '')

In [4]:
# client-side throttling targets (per minute); used as defaults by ChatGPT below
request_limit_per_minute = 500
token_limit_per_minute = 2e6

request_timeout_seconds = 120   # maximum wait time for the API to respond before triggering a request timeout
request_max_retries = 1         # number of times the client automatically retries failed requests
tpm_wait_polling_seconds = 10    # if our internal TPM estimate thinks the TPM limit is exceeded, how often to re-check whether it has cleared

# global logger for static classes
logger = logging.getLogger()
logging.basicConfig(level=logging.INFO)

In [6]:
class ChatGPT:
    """
    Chat-completion client for OpenRouter (through the OpenAI SDK).

    Keeps a per-instance history of the messages sent and the responses
    received so that running token counts and cost can be reported, and applies
    simple client-side request/token throttling.
    """

    def __init__(self, model_provider_order,
                 halt_on_error=True,
                 is_verbose=True,
                 timeout=request_timeout_seconds,
                 max_retries=request_max_retries,
                 request_limit_per_minute=request_limit_per_minute,
                 token_limit_per_minute=token_limit_per_minute,
                 tpm_wait_polling_seconds=tpm_wait_polling_seconds,
                 logger=logger,
                 api_key=OPENROUTER_API_KEY,
                 limit_manager_db_password=globals().get('LIMIT_MANAGER_DB_PASSWORD')):
        """
        Parameters
        ----------
        model_provider_order : tuple
            (model id, list of provider names in preference order, canonical model name).
        halt_on_error : bool
            If True, errors are raised; otherwise they are logged (when verbose) and
            the failing call returns None.
        is_verbose : bool
            Enables informational logging of errors and throttling waits.
        timeout, max_retries
            Forwarded to the OpenAI client.
        request_limit_per_minute : number
            Used to derive a fixed delay (60 / limit seconds) before every request.
        token_limit_per_minute : number
            Upper bound used by the internal tokens-per-minute checks.
        tpm_wait_polling_seconds : number
            Sleep interval while waiting for the internal TPM window to clear.
        logger : logging.Logger
            Logger used for all instance-level messages.
        api_key : str
            OpenRouter API key.
        limit_manager_db_password
            Stored on the instance only. The default is looked up at class-definition
            time and is None when the notebook never defined LIMIT_MANAGER_DB_PASSWORD
            (no definition is visible in this notebook), which avoids a NameError.
        """
        self.model, self.provider_order, self.model_canonical_name = model_provider_order
        self.halt_on_error = halt_on_error
        self.is_verbose = is_verbose
        self.tpm_wait_polling_seconds = tpm_wait_polling_seconds
        self.request_limit_per_minute = request_limit_per_minute
        self.request_delay_seconds = 60.0 / request_limit_per_minute
        self.token_limit_per_minute = token_limit_per_minute
        self.response_history = []
        self.message_history = {}
        # (n_tokens, timestamp) records consumed by check_internal_TPM_tracker
        self.token_count_history = []
        self.logger = logger
        self.limit_manager_db_password = limit_manager_db_password
        likert_options = [
            "Very Inaccurate",
            "Moderately Inaccurate",
            "Neither Accurate nor Inaccurate",
            "Moderately Accurate",
            "Very Accurate",
        ]
        # Sort by length so that longer options are tried before shorter ones
        self.likert_options = sorted(likert_options, key=len, reverse=True)
        self.default_seed = 1 if 'x-ai' in self.model else 0

        self.client = OpenAI(base_url="https://openrouter.ai/api/v1",
                             api_key=api_key,
                             timeout=timeout,
                             max_retries=max_retries)

    def extract_likert_response(self, content):
        """
        Find the first Likert option (longest options tried first) that occurs as a
        whole phrase in ``content``, case-insensitively.

        Returns a JSON string of the form {"response": <option>}.
        Raises Exception when no option is found.
        """
        content_lower = content.lower()
        for option in self.likert_options:
            pattern = r'\b' + re.escape(option.lower()) + r'\b'
            if re.search(pattern, content_lower):
                return json.dumps({"response": option})
        # single formatted message (two positional args would render as a tuple)
        raise Exception(f"No Likert match found in: {content}")

    def get_running_cost_num_prompt_completion_tokens(self):
        """
        Aggregate usage over every response stored in ``self.response_history``.

        Returns: total_running_cost, total_num_prompt_tokens, total_num_response_tokens
        """
        n_prompt_tokens = np.sum([x.usage.prompt_tokens for x in self.response_history])
        n_completion_tokens = np.sum([x.usage.completion_tokens for x in self.response_history])
        total_cost = sum(r.usage.cost for r in self.response_history)
        return (total_cost,
                n_prompt_tokens,
                n_completion_tokens)

    def get_key_usage_credits(self):
        """
        Query OpenRouter's credits endpoint for this API key.

        Returns (total_usage, total_credits) as reported by the endpoint.
        Raises requests.HTTPError on a non-success status.
        """
        resp = requests.get(
            "https://openrouter.ai/api/v1/credits",
            headers={"Authorization": f"Bearer {self.client.api_key}"}
        )
        resp.raise_for_status()
        info = resp.json()["data"]
        return info["total_usage"], info["total_credits"]

    # retry failing requests starting with a 10 second wait,
    # increasing the wait by 10 seconds each retry (capped at 120s), for at most 5 attempts
    # the goal is to try to avoid hitting backoff,
    # we treat this as a last resort because of its runtime cost
    @retry(wait=wait_incrementing(start=10, increment=10, max=120),
           stop=stop_after_attempt(5),
           retry=retry_if_exception_type((RateLimitError, APITimeoutError, InternalServerError, Timeout)),
           before_sleep=before_sleep_log(logger, logging.INFO),
           after=after_log(logger, logging.INFO))
    def completion_with_backoff(self, client, **kwargs):
        """Call ``client.chat.completions.create(**kwargs)`` under the retry policy above."""
        return client.chat.completions.create(**kwargs)

    def _tokens_in_last_minute(self):
        """Drop token records older than 60 seconds and return the token total of the rest."""
        one_minute_ago = datetime.datetime.now() - datetime.timedelta(seconds=60)
        self.token_count_history = [x for x in self.token_count_history if x[1] > one_minute_ago]
        return np.sum([x[0] for x in self.token_count_history])

    def check_internal_TPM_tracker(self, n_message_tokens):
        """
        Checks the internal TPM count to see if a message with length = n_message_tokens
        can be sent. If not, it waits (sleeps - blocking) until sending the message
        fits within the TPM limit, then records the message in the token history.

        A message that is larger than the per-minute limit on its own can never fit,
        so it is reported through ``bad_response_output`` instead of waiting forever.
        """
        if n_message_tokens > self.token_limit_per_minute:
            self.bad_response_output(f'Unable to send message as it exceeds TPM. Number of tokens in message = {n_message_tokens}')
            return
        # fixed delay waiting if TPM exceeded over past minute
        # this is cpu polling, so it doesnt cost money or much compute
        while self._tokens_in_last_minute() + n_message_tokens > self.token_limit_per_minute:
            if self.is_verbose:
                self.logger.info(f'Internal TPM limit exceeded, waiting for {self.tpm_wait_polling_seconds} seconds...')
            time.sleep(self.tpm_wait_polling_seconds)
        self.token_count_history.append((n_message_tokens, datetime.datetime.now()))

    def send_message(self, system_role, message, json_schema, validate_response=True):
        """
        This is the primary function used to send messages to the model and get responses.
        Steps are:
          - sleep for the fixed per-request delay derived from the RPM limit
          - estimate the token count of the message and refuse it if it exceeds the TPM limit
          - build and send the message using the chat-completions API
          - fall back to the `reasoning` field when the returned content is empty

        Parameters
        ----------
        system_role : str
            System prompt (omitted from the request for 'x-ai' models).
        message : str
            User message.
        json_schema : dict
            Passed as the `json_schema` of the structured `response_format`.
        validate_response : bool
            Currently unused; kept for interface compatibility.

        Returns
        -------
        (response, message_id) on success, where message_id is the key of the sent
        messages in ``self.message_history``; None on failure when ``halt_on_error``
        is False. With ``halt_on_error`` True, failures are raised as exceptions.
        """
        # sleep based on RPM limit (lazy logic, avoids keeping running count of actual requests per minute)
        time.sleep(self.request_delay_seconds)

        # estimate the message size; fall back to the gpt-4 encoding when tiktoken
        # does not know the model name
        try:
            encoding = tiktoken.encoding_for_model(self.model)
        except Exception:
            encoding = tiktoken.encoding_for_model('gpt-4')
        n_message_tokens = len(encoding.encode(system_role)) + len(encoding.encode(message))
        self.logger.info(f'processing message with {n_message_tokens} tokens...')
        # NOTE(review): only the single-message size is checked here; the running
        # tracker (check_internal_TPM_tracker) is not invoked from this method.
        if n_message_tokens > self.token_limit_per_minute:
            return self.bad_response_output(f'Unable to send message as it exceeds TPM. Number of tokens in message = {n_message_tokens}')

        # build and send message over openai-api
        message_id = len(self.message_history)
        self.message_history[message_id] = [] if 'x-ai' in self.model else [{"role": "system", "content": system_role}]
        self.message_history[message_id].append({"role": "user", "content": message})
        try:
            response = self.completion_with_backoff(self.client,
                                                    model=self.model,
                                                    messages=self.message_history[message_id],
                                                    temperature=0,
                                                    stream=False,
                                                    extra_body={"usage": {"include": True},
                                                                "reasoning": {# One of the following (not both):
                                                                              "effort": "medium", # Can be "high", "medium", or "low" (OpenAI-style)
                                                                              # Optional: Default is false. All models support this.
                                                                              "exclude": False # Set to true to exclude reasoning tokens from response
                                                                              },
                                                                "provider": {"order": self.provider_order,
                                                                             "sort": "price",
                                                                             "data_collection": "deny",
                                                                             "allow_fallbacks": False}},
                                                    response_format={"type": "json_schema",
                                                                     "json_schema": json_schema},
                                                    seed=self.default_seed, logprobs=False)

            self.response_history.append(response)
            reply = response.choices[0].message
            if reply.content is None:
                # raises when halt_on_error is set (re-raised by the handler below)
                self.bad_response_output('None in message content')
                return None
            if reply.content == '' and hasattr(reply, 'reasoning') and reply.reasoning != '':
                # empty content: use the reasoning text as the content instead
                reply.content = reply.reasoning
        except Exception as e:
            if self.halt_on_error:
                raise
            if self.is_verbose:
                self.logger.info(f'An exception occurred: {str(e)}')
                self.logger.info(traceback.format_exc())
            return None

        return (response, message_id)

    def bad_response_output(self, error):
        """
        Report an error: raise Exception(error) when ``halt_on_error`` is set,
        otherwise log it (when verbose) and return None.
        """
        if self.halt_on_error:
            raise Exception(error)
        if self.is_verbose:
            self.logger.info(f'Error - {error}')
        return None

In [7]:
# Model specs, each a 3-tuple that ChatGPT.__init__ unpacks as
# (model id, provider preference order, canonical model name).
llama_4_maverick = ('meta-llama/llama-4-maverick', ['Fireworks', 'Together', 'Kluster'], 'llama_maverick')
gemini_flash = ('google/gemini-2.5-flash-preview-05-20:thinking', ['Google', 'Google AI Studio'], 'gemini_flash')
qwen_235B = ('qwen/qwen3-235b-a22b', ['DeepInfra', 'Kluster', 'Parasail', 'Together', 'Nebius'], 'qwen3_235B')  
gpt_41 = ('openai/gpt-4.1', ['OpenAI'], 'gpt_41')
gpt_41_mini = ('openai/gpt-4.1-mini', ['OpenAI'], 'gpt_41_mini')
claude_sonnet = ('anthropic/claude-3.7-sonnet', ['Anthropic', 'Amazon Bedrock', 'Google', 'Google AI Studio'], 'claude_sonnet')
grok_3 = ('x-ai/grok-3-beta', ['xAI'], 'grok3_beta')

# Example: We expect a simple string response from GPT

In [None]:
# model spec (one of the tuples defined above) passed to ChatGPT(model_provider_order=...)
MODEL_TO_EVALUATE = gpt_41

In [8]:
# Output JSON schema for a simple string response: an object with a single
# required string field, "response", and no additional properties.
# NOTE(review): an earlier comment (and the "description" text below) mention
# refusal tracking, but this schema defines no "refusal" / "reason_for_refusal"
# fields - confirm which is intended.
simple_string_response_json = {
    "name": "simple_string_response_json",
    "description": "Schema for a simple string response with refusal tracking",
    "schema": {
        "type": "object",
        "description": "JSON schema for a simple string response",
        "properties": {
            "response": {
                "type": "string",
                "description": "The generated output by GPT, formatted as a plain string"
            }
        },
        "additionalProperties": False,
        "required": ["response"]
    },
    "strict": True
}

In [9]:
def example_messaging_wrapper(chat, system_role, message, json_schema):
    """
    Send one message through ``chat``, then print the messages that were sent,
    the "response" field of the reply, and the running cost.

    Any failure (including a None result from ``send_message``) is logged and its
    traceback printed; the printed response is then an empty string.

    Returns (response, message_history_id); these stay (None, -1) when the send failed.
    """
    response = None
    message_history_id = -1
    response_str = ''
    try:
        response, message_history_id = chat.send_message(
            system_role=system_role,
            message=message,
            json_schema=json_schema,
            validate_response=True,
        )
        assert response is not None
        response_str = json.loads(response.choices[0].message.content)["response"]
    except Exception as err:
        response_str = ''
        chat.logger.info(f'Messaging wrapper failure - {str(err)}')
        print(traceback.format_exc())

    cost, _n_prompt, _n_completion = chat.get_running_cost_num_prompt_completion_tokens()
    if message_history_id in chat.message_history:
        last_messages_sent_to_gpt = chat.message_history[message_history_id]
    else:
        last_messages_sent_to_gpt = ''
    print(f'Messages to GPT:\n{last_messages_sent_to_gpt}')
    print(f'Response from GPT:\n{response_str}')
    print(f'Cost: ${cost:.5f}')

    return response, message_history_id

In [10]:
# specify system role and user message
# system prompt and user message for the demo request
system_role = 'you are a helpful assistant.'
message = 'help me bake a vanilla cake.'

# a single shared ChatGPT instance, so running costs accumulate in one place
chat = ChatGPT(model_provider_order=MODEL_TO_EVALUATE)

response, message_history_id = example_messaging_wrapper(
    chat=chat,
    system_role=system_role,
    message=message,
    json_schema=simple_string_response_json,
)

INFO:root:processing message with 13 tokens...
INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"


Messages to GPT:
[{'role': 'system', 'content': 'you are a helpful assistant.'}, {'role': 'user', 'content': 'help me bake a vanilla cake.'}]
Response from GPT:
Sure! Here’s a simple vanilla cake recipe:

Ingredients:
- 1 and 1/2 cups (190g) all-purpose flour
- 1 cup (200g) sugar
- 1/2 cup (115g) unsalted butter, softened
- 2 large eggs
- 1/2 cup (120ml) milk
- 2 tsp vanilla extract
- 1 and 1/2 tsp baking powder
- 1/4 tsp salt

Instructions:
1. Preheat your oven to 350°F (175°C). Grease and flour an 8-inch round cake pan.
2. In a bowl, whisk together flour, baking powder, and salt.
3. In another bowl, beat the butter and sugar until light and fluffy. Add eggs one at a time, beating well after each. Mix in vanilla extract.
4. Add the dry ingredients to the wet mixture in three parts, alternating with the milk. Start and end with the dry ingredients. Mix until just combined.
5. Pour the batter into the prepared pan and smooth the top.
6. Bake for 25-30 minutes, or until a toothpick inser

# AAPECS

In [11]:
import pandas as pd
from os import listdir
from os.path import isfile, join
import time
from scipy import stats

# None removes the column limit, so wide DataFrames are displayed without truncation
pd.options.display.max_columns = None

In [12]:
# Questionnaire items (120 entries) presented to the model as IPIP-NEO-120 questions.
# Each entry has:
#   'text'   - the item wording (prefixed with "Question <n>: " right after this list)
#   'keyed'  - 'plus' or 'minus'; presumably the scoring direction - confirm against the scoring code
#   'domain' - one of 'N', 'E', 'O', 'A', 'C' (the same letters used as values in name_mapping_aapecs)
questions_list = [
    {'text': 'Worry about things', 'keyed': 'plus', 'domain': 'N'},
    {'text': 'Make friends easily', 'keyed': 'plus', 'domain': 'E'},
    {'text': 'Have a vivid imagination', 'keyed': 'plus', 'domain': 'O'},
    {'text': 'Trust others', 'keyed': 'plus', 'domain': 'A'},
    {'text': 'Complete tasks successfully', 'keyed': 'plus', 'domain': 'C'},
    {'text': 'Get angry easily', 'keyed': 'plus', 'domain': 'N'},
    {'text': 'Love large parties', 'keyed': 'plus', 'domain': 'E'},
    {'text': 'Believe in the importance of art', 'keyed': 'plus', 'domain': 'O'},
    {'text': 'Use others for my own ends', 'keyed': 'minus', 'domain': 'A'},
    {'text': 'Like to tidy up', 'keyed': 'plus', 'domain': 'C'},
    {'text': 'Often feel blue', 'keyed': 'plus', 'domain': 'N'},
    {'text': 'Take charge', 'keyed': 'plus', 'domain': 'E'},
    {'text': 'Experience my emotions intensely', 'keyed': 'plus', 'domain': 'O'},
    {'text': 'Love to help others', 'keyed': 'plus', 'domain': 'A'},
    {'text': 'Keep my promises', 'keyed': 'plus', 'domain': 'C'},
    {'text': 'Find it difficult to approach others', 'keyed': 'plus', 'domain': 'N'},
    {'text': 'Am always busy', 'keyed': 'plus', 'domain': 'E'},
    {'text': 'Prefer variety to routine', 'keyed': 'plus', 'domain': 'O'},
    {'text': 'Love a good fight', 'keyed': 'minus', 'domain': 'A'},
    {'text': 'Work hard', 'keyed': 'plus', 'domain': 'C'},
    {'text': 'Go on binges', 'keyed': 'plus', 'domain': 'N'},
    {'text': 'Love excitement', 'keyed': 'plus', 'domain': 'E'},
    {'text': 'Love to read challenging material', 'keyed': 'plus', 'domain': 'O'},
    {'text': 'Believe that I am better than others', 'keyed': 'minus', 'domain': 'A'},
    {'text': 'Am always prepared', 'keyed': 'plus', 'domain': 'C'},
    {'text': 'Panic easily', 'keyed': 'plus', 'domain': 'N'},
    {'text': 'Radiate joy', 'keyed': 'plus', 'domain': 'E'},
    {'text': 'Tend to vote for liberal political candidates', 'keyed': 'plus', 'domain': 'O'},
    {'text': 'Sympathize with the homeless', 'keyed': 'plus', 'domain': 'A'},
    {'text': 'Jump into things without thinking', 'keyed': 'minus', 'domain': 'C'},
    {'text': 'Fear for the worst', 'keyed': 'plus', 'domain': 'N'},
    {'text': 'Feel comfortable around people', 'keyed': 'plus', 'domain': 'E'},
    {'text': 'Enjoy wild flights of fantasy', 'keyed': 'plus', 'domain': 'O'},
    {'text': 'Believe that others have good intentions', 'keyed': 'plus', 'domain': 'A'},
    {'text': 'Excel in what I do', 'keyed': 'plus', 'domain': 'C'},
    {'text': 'Get irritated easily', 'keyed': 'plus', 'domain': 'N'},
    {'text': 'Talk to a lot of different people at parties', 'keyed': 'plus', 'domain': 'E'},
    {'text': 'See beauty in things that others might not notice', 'keyed': 'plus', 'domain': 'O'},
    {'text': 'Cheat to get ahead', 'keyed': 'minus', 'domain': 'A'},
    {'text': 'Often forget to put things back in their proper place', 'keyed': 'minus', 'domain': 'C'},
    {'text': 'Dislike myself', 'keyed': 'plus', 'domain': 'N'},
    {'text': 'Try to lead others', 'keyed': 'plus', 'domain': 'E'},
    {'text': 'Feel others\' emotions', 'keyed': 'plus', 'domain': 'O'},
    {'text': 'Am concerned about others', 'keyed': 'plus', 'domain': 'A'},
    {'text': 'Tell the truth', 'keyed': 'plus', 'domain': 'C'},
    {'text': 'Am afraid to draw attention to myself', 'keyed': 'plus', 'domain': 'N'},
    {'text': 'Am always on the go', 'keyed': 'plus', 'domain': 'E'},
    {'text': 'Prefer to stick with things that I know', 'keyed': 'minus', 'domain': 'O'},
    {'text': 'Yell at people', 'keyed': 'minus', 'domain': 'A'},
    {'text': 'Do more than what\'s expected of me', 'keyed': 'plus', 'domain': 'C'},
    {'text': 'Rarely overindulge', 'keyed': 'minus', 'domain': 'N'},
    {'text': 'Seek adventure', 'keyed': 'plus', 'domain': 'E'},
    {'text': 'Avoid philosophical discussions', 'keyed': 'minus', 'domain': 'O'},
    {'text': 'Think highly of myself', 'keyed': 'minus', 'domain': 'A'},
    {'text': 'Carry out my plans', 'keyed': 'plus', 'domain': 'C'},
    {'text': 'Become overwhelmed by events', 'keyed': 'plus', 'domain': 'N'},
    {'text': 'Have a lot of fun', 'keyed': 'plus', 'domain': 'E'},
    {'text': 'Believe that there is no absolute right and wrong', 'keyed': 'plus', 'domain': 'O'},
    {'text': 'Feel sympathy for those who are worse off than myself', 'keyed': 'plus', 'domain': 'A'},
    {'text': 'Make rash decisions', 'keyed': 'minus', 'domain': 'C'},
    {'text': 'Am afraid of many things', 'keyed': 'plus', 'domain': 'N'},
    {'text': 'Avoid contacts with others', 'keyed': 'minus', 'domain': 'E'},
    {'text': 'Love to daydream', 'keyed': 'plus', 'domain': 'O'},
    {'text': 'Trust what people say', 'keyed': 'plus', 'domain': 'A'},
    {'text': 'Handle tasks smoothly', 'keyed': 'plus', 'domain': 'C'},
    {'text': 'Lose my temper', 'keyed': 'plus', 'domain': 'N'},
    {'text': 'Prefer to be alone', 'keyed': 'minus', 'domain': 'E'},
    {'text': 'Do not like poetry', 'keyed': 'minus', 'domain': 'O'},
    {'text': 'Take advantage of others', 'keyed': 'minus', 'domain': 'A'},
    {'text': 'Leave a mess in my room', 'keyed': 'minus', 'domain': 'C'},
    {'text': 'Am often down in the dumps', 'keyed': 'plus', 'domain': 'N'},
    {'text': 'Take control of things', 'keyed': 'plus', 'domain': 'E'},
    {'text': 'Rarely notice my emotional reactions', 'keyed': 'minus', 'domain': 'O'},
    {'text': 'Am indifferent to the feelings of others', 'keyed': 'minus', 'domain': 'A'},
    {'text': 'Break rules', 'keyed': 'minus', 'domain': 'C'},
    {'text': 'Only feel comfortable with friends', 'keyed': 'plus', 'domain': 'N'},
    {'text': 'Do a lot in my spare time', 'keyed': 'plus', 'domain': 'E'},
    {'text': 'Dislike changes', 'keyed': 'minus', 'domain': 'O'},
    {'text': 'Insult people', 'keyed': 'minus', 'domain': 'A'},
    {'text': 'Do just enough work to get by', 'keyed': 'minus', 'domain': 'C'},
    {'text': 'Easily resist temptations', 'keyed': 'minus', 'domain': 'N'},
    {'text': 'Enjoy being reckless', 'keyed': 'plus', 'domain': 'E'},
    {'text': 'Have difficulty understanding abstract ideas', 'keyed': 'minus', 'domain': 'O'},
    {'text': 'Have a high opinion of myself', 'keyed': 'minus', 'domain': 'A'},
    {'text': 'Waste my time', 'keyed': 'minus', 'domain': 'C'},
    {'text': "Feel that I'm unable to deal with things", 'keyed': 'plus', 'domain': 'N'},
    {'text': 'Love life', 'keyed': 'plus', 'domain': 'E'},
    {'text': 'Tend to vote for conservative political candidates', 'keyed': 'minus', 'domain': 'O'},
    {'text': "Am not interested in other people's problems", 'keyed': 'minus', 'domain': 'A'},
    {'text': 'Rush into things', 'keyed': 'minus', 'domain': 'C'},
    {'text': 'Get stressed out easily', 'keyed': 'plus', 'domain': 'N'},
    {'text': 'Keep others at a distance', 'keyed': 'minus', 'domain': 'E'},
    {'text': 'Like to get lost in thought', 'keyed': 'plus', 'domain': 'O'},
    {'text': 'Distrust people', 'keyed': 'minus', 'domain': 'A'},
    {'text': 'Know how to get things done', 'keyed': 'plus', 'domain': 'C'},
    {'text': 'Am not easily annoyed', 'keyed': 'minus', 'domain': 'N'},
    {'text': 'Avoid crowds', 'keyed': 'minus', 'domain': 'E'},
    {'text': 'Do not enjoy going to art museums', 'keyed': 'minus', 'domain': 'O'},
    {'text': "Obstruct others' plans", 'keyed': 'minus', 'domain': 'A'},
    {'text': 'Leave my belongings around', 'keyed': 'minus', 'domain': 'C'},
    {'text': 'Feel comfortable with myself', 'keyed': 'minus', 'domain': 'N'},
    {'text': 'Wait for others to lead the way', 'keyed': 'minus', 'domain': 'E'},
    {'text': "Don't understand people who get emotional", 'keyed': 'minus', 'domain': 'O'},
    {'text': 'Take no time for others', 'keyed': 'minus', 'domain': 'A'},
    {'text': 'Break my promises', 'keyed': 'minus', 'domain': 'C'},
    {'text': 'Am not bothered by difficult social situations', 'keyed': 'minus', 'domain': 'N'},
    {'text': 'Like to take it easy', 'keyed': 'minus', 'domain': 'E'},
    {'text': 'Am attached to conventional ways', 'keyed': 'minus', 'domain': 'O'},
    {'text': 'Get back at others', 'keyed': 'minus', 'domain': 'A'},
    {'text': 'Put little time and effort into my work', 'keyed': 'minus', 'domain': 'C'},
    {'text': 'Am able to control my cravings', 'keyed': 'minus', 'domain': 'N'},
    {'text': 'Act wild and crazy', 'keyed': 'plus', 'domain': 'E'},
    {'text': 'Am not interested in theoretical discussions', 'keyed': 'minus', 'domain': 'O'},
    {'text': 'Boast about my virtues', 'keyed': 'minus', 'domain': 'A'},
    {'text': 'Have difficulty starting tasks', 'keyed': 'minus', 'domain': 'C'},
    {'text': 'Remain calm under pressure', 'keyed': 'minus', 'domain': 'N'},
    {'text': 'Look at the bright side of life', 'keyed': 'plus', 'domain': 'E'},
    {'text': 'Believe that we should be tough on crime', 'keyed': 'minus', 'domain': 'O'},
    {'text': 'Try not to think about the needy', 'keyed': 'minus', 'domain': 'A'},
    {'text': 'Act without thinking', 'keyed': 'minus', 'domain': 'C'}
]

# prefix every item with its 1-based position, e.g. "Question 1: Worry about things"
for number, question in enumerate(questions_list, start=1):
    question['text'] = f'Question {number}: ' + question['text']

In [None]:
# Root folder of the AAPECS data; later cells build paths as f'{DATA_ROOT}/...'.
DATA_ROOT = './data/AAPECS/'  # please contact the authors for access to the data

In [15]:
pheno_df = pd.read_csv(f'{DATA_ROOT}/eod_new_time.csv')
data_root = f'{DATA_ROOT}/raw_video_logs'
data_files = [f for f in listdir(data_root) if isfile(join(data_root, f))]

# participant number -> list of transcript texts / recording end times, in log-row order
sub_transcripts = {}
sub_lengths = {}

for file in tqdm(data_files):
    df = pd.read_csv(f'{data_root}/{file}')
    # the last column is used as the recording address; drop rows without a recording
    df = df[(df.values[:, -1] != 'NO_ANSWER') & (df.values[:, -1] != 'SKIPPED')]

    dates = [x.replace('/', '_') for x in df['Survey Submitted Date'].values]  # dd_mm_yyyy
    times = [x.replace(':', '-') for x in df['Survey Submitted Time'].values]  # ':' replaced by '-' for use in file names
    addresses = df.values[:, -1]
    userid = df['User Id'].values
    # the participant number is whatever remains of the file name after removing the known affixes
    usernum = file.lower().replace('eod', '').replace('vids', '').replace('videos', '').replace('.csv', '').replace('video', '')
    usernum = int(usernum)
    triggers = [x.replace(' ', '') for x in df['Trigger Type'].values]

    assert np.all(['http' in x for x in addresses])
    assert np.all((df['Trigger Type'].values == "DAILY") | (df['Trigger Type'].values == "DELETED TRIGGER") | (df['Trigger Type'].values == "ONCE"))
    # Every address must point at an .mp4 file. The previous form passed a generator
    # to np.all (always truthy) and indexed the wrong list, so it could never fail.
    assert all(x.split('/')[-1].split('.')[-1] == 'mp4' for x in addresses), f'non-mp4 address found in {file}'

    # skip participants without phenotype rows
    sub_pheno_df = pheno_df[pheno_df.participantID == usernum]
    if sub_pheno_df.shape[0] == 0: continue

    for i in range(len(addresses)):
        transcript_path = f'{DATA_ROOT}/transcripts_json/{usernum}/{usernum}_{dates[i]}_{times[i]}_{triggers[i]}_{userid[i]}.json'
        # read each transcript once and close the file handle deterministically
        with open(transcript_path) as transcript_file:
            transcript = json.load(transcript_file)
        txt = transcript['text'].strip()

        # unpacking also checks that the date has exactly three '_'-separated parts
        day, month, year = dates[i].split('_')
        sub = usernum
        sub_transcripts.setdefault(sub, []).append(txt)

        # end time of the last segment is taken as the recording length
        recording_length = transcript['segments'][-1]['end']
        sub_lengths.setdefault(sub, []).append(recording_length)

A Jupyter Widget

INFO:numexpr.utils:Note: NumExpr detected 32 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.


In [17]:
def get_NEO_FFI_prompt(thoughts, questions):
    """
    Build the user prompt that asks the model to answer the given questionnaire
    items from the participant's diaries and to justify the ratings with themes
    and verbatim quotes.

    ``questions`` is inserted under the questions heading and ``thoughts`` under
    the diaries heading; both are interpolated as-is.
    Returns the assembled prompt string.
    """
    return f"""
Your task is to respond to the following IPIP-NEO-120 questions based on the participant's daily diaries of the most significant event that occurred during the day, provided below. Respond as though you are the individual who generated these thoughts, reflecting their personality traits.
Base your answer on inferred personality traits. Think carefully about what the thoughts imply about tendencies and behaviors.

IPIP-NEO-120 questions to answer:
{questions}

For each question, select the most appropriate option:
- Very Inaccurate: The statement is definitely false or the participant would strongly disagree with it.
- Moderately Inaccurate: The statement is mostly false or the participant would generally disagree with it.
- Neither Accurate nor Inaccurate: The participant would be neutral on the statement, cannot decide, or find the statement equally true and false.
- Moderately Accurate: The statement is mostly true or the participant would generally agree with it.
- Very Accurate: The statement is definitely true or the participant would strongly agree with it.

Then:
Provide 3-5 high-level themes that explain *why* you gave the ratings above. Do not provide one theme per question, instead focus on the most significant patterns or insights that emerge across the questions above. 
For each theme, include:
  - A brief explanation of a theme that informed your judgment.
  - All direct quotes from the participant's diaries that support the theme and explanation.
  - Remember: Do not paraphrase or invent quotes, the quotes must be exactly as given in the participant's diaries below.

Participant's daily diaries:
{thoughts}
"""

In [18]:
# Maps NEO domain names to the single-letter domain codes used in questions_list.
# NOTE(review): the keys are presumably column names in the AAPECS phenotype data - confirm.
name_mapping_aapecs = {
    'neoOpenness': 'O',
    'neoConscientiousness': 'C',
    'neoExtraversion': 'E',
    'neoAgreeableness': 'A',
    'neoNeuroticism': 'N'
}

In [20]:
def find_required_fields(schema, parent_key=''):
    """
    Collect the dotted paths of all fields listed under 'required' in ``schema``
    and, recursively, in every sub-schema found under 'properties'.

    ``parent_key`` is the dotted path of ``schema`` itself ('' at the top level).
    Returns a list of paths: this level's required fields first, then those of
    each property in iteration order.
    """
    def qualify(name):
        # prefix with the parent path unless we are at the top level
        return f"{parent_key}.{name}" if parent_key else name

    collected = []
    if 'required' in schema:
        collected += [qualify(name) for name in schema['required']]

    if 'properties' in schema:
        for prop_name, prop_schema in schema['properties'].items():
            collected += find_required_fields(prop_schema, qualify(prop_name))

    return collected


def get_value_from_path(data, path):
    """
    Follow a dotted ``path`` (e.g. "a.b.0") through nested containers and return
    the value found there. A path segment is converted to int whenever the
    container reached so far is a list. Lookup errors propagate to the caller.
    """
    current = data
    for segment in path.split('.'):
        index = int(segment) if isinstance(current, list) else segment
        current = current[index]
    return current


def get_required_values(schema, response):
    """
    Look up every required field of ``schema`` (as found by find_required_fields)
    in ``response``.

    Returns a dict mapping each dotted field path to its value, or to None when
    the lookup raises KeyError.
    """
    resolved = {}
    for path in find_required_fields(schema):
        try:
            resolved[path] = get_value_from_path(response, path)
        except KeyError:
            # missing key: record the field as absent
            resolved[path] = None
    return resolved

In [21]:
def generic_messaging_wrapper(chat, system_role, message, json_schema):
    """
    Send one message through ``chat`` and parse the reply content as JSON.

    Any failure (including a None result from ``send_message``) is logged and its
    traceback printed.

    Returns (parsed_response, running_cost): the decoded JSON reply, or None when
    the send or the parsing failed, together with the running cost reported by ``chat``.
    """
    parsed_response = None
    try:
        reply, _history_id = chat.send_message(system_role=system_role,
                                               message=message,
                                               json_schema=json_schema)
        assert reply is not None
        parsed_response = json.loads(reply.choices[0].message.content)
    except Exception as err:
        chat.logger.info(f'Messaging wrapper failure - {str(err)}')
        print(traceback.format_exc())

    running_cost, _n_prompt, _n_completion = chat.get_running_cost_num_prompt_completion_tokens()
    return parsed_response, running_cost

In [22]:
# Replaces the system_role set in the earlier example cell with an empty string.
system_role = ''
print(system_role)




In [23]:
import re
import difflib
import unicodedata

def normalize_text(s, case_insensitive=True, unicode_normalize=True):
    """Canonicalize a string so two texts can be compared character by character.

    Steps, in order:
    - optionally apply Unicode NFC normalization,
    - strip leading/trailing whitespace and collapse every internal run of
      whitespace (spaces, tabs, newlines) to a single space,
    - optionally lowercase.

    Args:
        s: text to normalize.
        case_insensitive: lowercase the result when True.
        unicode_normalize: apply NFC normalization when True.

    Returns:
        The normalized string.
    """
    if unicode_normalize:
        # NFC *composes*: "e" + combining acute accent becomes the single
        # code point "é", so visually identical strings compare equal.
        s = unicodedata.normalize('NFC', s)
    # str.split() with no argument splits on any whitespace run, newlines
    # included, and drops leading/trailing whitespace, so no separate
    # newline replacement is needed afterwards.
    s = ' '.join(s.split())
    if case_insensitive:
        s = s.lower()
    return s

def longest_common_substring(a_raw, b_raw):
    """Return the longest contiguous substring shared by the two texts.

    Both inputs are first passed through ``normalize_text``, so the comparison
    and the returned substring are in normalized form. Returns ``''`` when the
    texts share no character.

    Args:
        a_raw: first text; the result is sliced out of its normalized form.
        b_raw: second text.
    """
    a = normalize_text(a_raw)
    b = normalize_text(b_raw)

    # autojunk must be off: with the default (True), once the second sequence
    # has 200+ items SequenceMatcher treats frequently occurring items as junk.
    # For character sequences that is nearly every letter, which makes
    # find_longest_match miss or truncate matches for long quotes.
    matcher = difflib.SequenceMatcher(None, a, b, autojunk=False)
    match = matcher.find_longest_match(0, len(a), 0, len(b))
    if match.size == 0:
        return ''
    return a[match.a : match.a + match.size]

In [24]:
import re
import json

def generate_neo_schema(questions):
    """Build the structured-output schema used to rate a list of questionnaire items.

    ``questions`` is a list of strings such as "Question 1: I am not a worrier.".
    The text before the first ':' with spaces turned into underscores becomes
    the property name (e.g. "Question_1"). Every item is a required string
    restricted to the five accuracy options ("Very Inaccurate" through
    "Very Accurate"). A required "justifications" array is appended, whose
    entries hold an "explanation" and a list of "quotes" ({"text": ...}).

    Returns a dict with "name", "description" and "schema" entries.
    """
    accuracy_levels = [
        "Very Inaccurate",
        "Moderately Inaccurate",
        "Neither Accurate nor Inaccurate",
        "Moderately Accurate",
        "Very Accurate"
    ]

    # e.g. "Question 1: ..." -> "Question_1"
    item_keys = [q.split(':')[0].replace(' ', '_') for q in questions]

    properties = {
        key: {
            "type": "string",
            "description": f"Response to '{q.strip()}'",
            "enum": accuracy_levels
        }
        for key, q in zip(item_keys, questions)
    }

    # One supporting quote: an object carrying just the quoted text.
    quote_item = {
        "type": "object",
        "additionalProperties": False,
        "properties": {
            "text": {
                "type": "string",
                "description": "The exact quote."
            }
        },
        "required": ["text"]
    }

    # One justification: a short explanation plus the quotes backing it.
    justification_item = {
        "type": "object",
        "additionalProperties": False,
        "properties": {
            "explanation": {
                "type": "string",
                "description": "A brief explanation of a theme or observation."
            },
            "quotes": {
                "type": "array",
                "description": "Direct quotes from the stream of thoughts that support the explanation.",
                "items": quote_item,
            }
        },
        "required": ["explanation", "quotes"]
    }

    properties["justifications"] = {
        "type": "array",
        "description": "Each entry provides an explanation and supporting quotes.",
        "items": justification_item
    }

    # NOTE(review): "strict" is placed inside the schema body here; some
    # structured-output APIs expect it next to "name"/"schema" instead.
    # Confirm against how the chat client consumes this dict.
    return {
        "name": "neo_ffi_assessment_from_stream_of_thoughts",
        "description": (
            "Rates each NEO-FFI item from participant’s spontaneous stream of thoughts, "
            "plus structured justifications with supporting quotes."
        ),
        "schema": {
            "type": "object",
            "additionalProperties": False,
            "strict": True,
            "properties": properties,
            "required": item_keys + ["justifications"]
        }
    }

In [30]:
import re

def format_neo_summary(data, thoughts, min_quote_words=5):
    """Render the "justifications" of a model response as a readable multi-line string.

    Only quotes that can be located in ``thoughts`` are printed: each quote is
    replaced by its longest common substring with ``thoughts`` (as returned by
    ``longest_common_substring``), and kept only if that match has at least
    ``min_quote_words`` words. Question ratings in ``data`` are not included
    in the output.

    Args:
        data: response dict with a "justifications" list whose entries have an
            "explanation" string and a "quotes" list of {"text": ...} dicts.
            ``None`` or a dict without justifications yields the header only.
        thoughts: the source text the quotes are checked against.
        min_quote_words: minimum number of words a matched quote needs in
            order to be shown.

    Returns:
        The formatted text, starting with a "Justifications:" header.
    """
    lines = ["Justifications:\n"]

    # A failed request hands us None; treat it like a response with no entries.
    justifications = (data or {}).get("justifications") or []

    for i, entry in enumerate(justifications, start=1):
        lines.append(f"Reason {i}")
        lines.append(entry["explanation"])

        # Keep only the quotes that were actually found in the source text.
        # Strip before counting so a match that begins or ends on a space does
        # not get credited with an extra empty "word".
        verified_quotes = []
        for quote in entry["quotes"]:
            matched_quote = longest_common_substring(thoughts, quote['text']).strip()
            if len(matched_quote.split()) >= min_quote_words:
                verified_quotes.append(matched_quote)

        # The header reflects what is printed below it, so it is omitted when
        # no quote survived the check.
        if len(verified_quotes) == 1:
            lines.append("  Citation:")
        elif len(verified_quotes) > 1:
            lines.append("  Citations:")
        lines.extend(f"    \"{quote_text}\"" for quote_text in verified_quotes)

        # blank line between reasons
        lines.append("")

    return "\n".join(lines)


In [31]:
# Answer options in ascending order of endorsement.
_neo_response_options = (
    'Very Inaccurate',
    'Moderately Inaccurate',
    'Neither Accurate nor Inaccurate',
    'Moderately Accurate',
    'Very Accurate',
)

# Option -> points, counting up from 0 to 4; a missing answer (None) maps to NaN.
response_mapping = {label: points for points, label in enumerate(_neo_response_options)}
response_mapping[None] = np.nan

# Same options counted down from 4 to 0; a missing answer (None) maps to NaN.
reverse_mapping = {label: 4 - points for points, label in enumerate(_neo_response_options)}
reverse_mapping[None] = np.nan

def score_neo_trait(json_response):
    """Average the item ratings in a model response into one trait score.

    Every key containing "question" (case-insensitive) is treated as an item
    named "Question_<n>". Item n is looked up at ``questions_list[n - 1]``;
    items whose 'keyed' value is 'plus' are scored with ``response_mapping``,
    all others with ``reverse_mapping``.

    Args:
        json_response: dict of item key -> answer label, or ``None`` when the
            request failed.

    Returns:
        The mean item score, or NaN when there is no response or no item keys.
    """
    # A failed request yields None; report "no score" instead of raising.
    if not json_response:
        return np.nan

    item_keys = [key for key in json_response if 'question' in key.lower()]
    if not item_keys:
        # Avoids numpy's "mean of empty slice" warning; the result is NaN either way.
        return np.nan

    item_scores = []
    for key in item_keys:
        # "Question_<n>" -> zero-based position in questions_list
        # (assumes questions_list is ordered by item number — TODO confirm).
        item = questions_list[int(key.split('_')[1]) - 1]
        mapping = response_mapping if item['keyed'] == 'plus' else reverse_mapping
        item_scores.append(mapping[json_response[key]])

    return np.mean(item_scores)

In [32]:
# Single-participant run: rate one participant on one trait, then print the
# resulting score and the formatted reasoning.
chat = ChatGPT(model_provider_order=MODEL_TO_EVALUATE)

subject = 54
trait = 'neoNeuroticism'

# The participant's transcript entries, joined one per line.
thoughts = '\n'.join(sub_transcripts[subject])
# Item texts whose 'domain' equals the trait's name as translated by name_mapping_aapecs.
neu_questions = [x['text'] for x in questions_list if x['domain']==name_mapping_aapecs[trait]]
neu_schema = generate_neo_schema(neu_questions)
message = get_NEO_FFI_prompt(thoughts, 
                             '\n'.join(neu_questions))
required_values, cost = generic_messaging_wrapper(chat, system_role, message, neu_schema)

# NOTE(review): generic_messaging_wrapper returns None for required_values when
# the request or JSON decoding fails; the two calls below do not appear to
# handle that case — confirm.
trait_score = score_neo_trait(required_values)
trait_reasoning = format_neo_summary(required_values, thoughts)

print(trait_score)
print(trait_reasoning)

INFO:root:processing message with 3311 tokens...
INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"


2.375
Justifications:

Reason 1
Frequent worry, stress, and concern about safety and future events, especially related to driving and loved ones, indicate high neuroticism (worry, stress, fear for the worst).
  Citations:
    "it seems to be happening a little bit more lately, and i don't know if it's because i'm driving more."
    "it's very concerning how often it happens, because i do drive for a living. i have to be mindful of other people, and it's scary, because i can't have an accident with my car."
    "i've again had so many different times where this has happened, starting to become worrisome."
    "that's been something that's been stressing me out every day."
    "she kind of fidgeted all night, so i was really stressed worrying about her."
    "it was a very stressful night, and then this morning, they wake me up very early, 5.30 in the morning usually, to feed them."

Reason 2
Despite stress, the participant is generally able to function, complete tasks, and maintain soci

In [33]:
# Saved per-participant trait scores; a later cell sorts this table per trait
# to pick the lowest- and highest-scoring participants.
# NOTE(review): the 'Unnamed: 0' / 'Unnamed: 0.1' columns in the output suggest
# the CSV was written with its index more than once — confirm.
gpt_scores = pd.read_csv(f'{DATA_ROOT}/aapecs_gpt_41_text_per_question_scores.csv')
gpt_scores.head()

Unnamed: 0.1,Unnamed: 0,participantID,neoOpenness,neoConscientiousness,neoExtraversion,neoAgreeableness,neoNeuroticism
0,0,85,1.875,2.5,1.083333,2.916667,2.75
1,1,54,2.083333,3.625,2.125,3.166667,1.833333
2,2,78,2.333333,2.75,1.458333,3.333333,3.166667
3,3,97,2.375,3.041667,1.75,3.333333,1.833333
4,4,48,1.708333,3.416667,2.083333,3.291667,0.791667


In [34]:
# Number of lowest-scoring and of highest-scoring participants evaluated per trait.
N_EXTREMES_PER_TRAIT = 3


def evaluate_participant_trait(subject, trait_questions, trait_schema):
    """Ask the model to rate one participant on one trait.

    Args:
        subject: key into ``sub_transcripts`` for the participant.
        trait_questions: list of item texts for the trait.
        trait_schema: schema built by ``generate_neo_schema`` for those items.

    Returns:
        ``(gpt_trait_score, trait_reasoning)``. When the request fails
        (``generic_messaging_wrapper`` returns ``None`` for the values) the
        result is ``(nan, '')`` so the caller can keep going.
    """
    # A new ChatGPT instance is created for every request.
    chat = ChatGPT(model_provider_order=MODEL_TO_EVALUATE)
    thoughts = '\n'.join(sub_transcripts[subject])
    message = get_NEO_FFI_prompt(thoughts, '\n'.join(trait_questions))
    required_values, _ = generic_messaging_wrapper(chat, system_role, message, trait_schema)
    if required_values is None:
        # The wrapper has already logged the failure; record an empty result
        # rather than crash and lose everything collected so far.
        return np.nan, ''
    return score_neo_trait(required_values), format_neo_summary(required_values, thoughts)


outputs = []
for trait in tqdm(name_mapping_aapecs):
    trait_scores_df = gpt_scores.sort_values(by=trait, ascending=True)[['participantID', trait]]
    # Questions and schema depend only on the trait, so build them once per trait.
    neu_questions = [x['text'] for x in questions_list if x['domain']==name_mapping_aapecs[trait]]
    neu_schema = generate_neo_schema(neu_questions)

    for i in tqdm(range(N_EXTREMES_PER_TRAIT), desc=trait, leave=False):
        # i-th row from the bottom of the ranking, then i-th row from the top.
        for level, row in (('low', i), ('high', -(i + 1))):
            # true_trait_score is the value stored in gpt_scores for this trait.
            subject, true_trait_score = trait_scores_df.values[row, :]
            gpt_trait_score, trait_reasoning = evaluate_participant_trait(
                subject, neu_questions, neu_schema)
            outputs.append((subject, f'{level}_{trait}', true_trait_score,
                            gpt_trait_score, trait_reasoning))


A Jupyter Widget

A Jupyter Widget

INFO:root:processing message with 1456 tokens...
INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
INFO:root:processing message with 3435 tokens...
INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
INFO:root:processing message with 1029 tokens...
INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
INFO:root:processing message with 5323 tokens...
INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
INFO:root:processing message with 3131 tokens...
INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
INFO:root:processing message with 2923 tokens...
INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"


A Jupyter Widget

INFO:root:processing message with 2652 tokens...
INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
INFO:root:processing message with 1859 tokens...
INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
INFO:root:processing message with 2127 tokens...
INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
INFO:root:processing message with 1013 tokens...
INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
INFO:root:processing message with 1802 tokens...
INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
INFO:root:processing message with 4212 tokens...
INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"


A Jupyter Widget

INFO:root:processing message with 2368 tokens...
INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
INFO:root:processing message with 1828 tokens...
INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
INFO:root:processing message with 3154 tokens...
INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
INFO:root:processing message with 1059 tokens...
INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
INFO:root:processing message with 3545 tokens...
INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
INFO:root:processing message with 1208 tokens...
INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"


A Jupyter Widget

INFO:root:processing message with 647 tokens...
INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
INFO:root:processing message with 2379 tokens...
INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
INFO:root:processing message with 1853 tokens...
INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
INFO:root:processing message with 3590 tokens...
INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
INFO:root:processing message with 1821 tokens...
INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
INFO:root:processing message with 3767 tokens...
INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"


A Jupyter Widget

INFO:root:processing message with 688 tokens...
INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
INFO:root:processing message with 3531 tokens...
INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
INFO:root:processing message with 1070 tokens...
INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
INFO:root:processing message with 2209 tokens...
INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
INFO:root:processing message with 1018 tokens...
INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
INFO:root:processing message with 2132 tokens...
INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"


In [35]:
# Self-report table restricted to the participants listed in sub_lengths and
# to the participantID column plus the columns of interest.
selected_ids = [int(pid) for pid in sub_lengths]
scale_columns = ['participantID', *cols_of_interest]
scales_all = pd.read_csv(f'{DATA_ROOT}/selfReport.csv')
scales_df = scales_all.loc[scales_all.participantID.isin(selected_ids), scale_columns]
print(scales_df.shape)
scales_df.head()

In [36]:
# Write the reasoning text per (level, trait) to disk. Participant IDs and
# scores are not part of this file, so no self-report lookup is needed here.
df = pd.DataFrame(outputs)
df.columns = ['participantID', 'trait_description', 'self_reported', 'gpt_predicted', 'gpt_reasoning']
# 'low_neoNeuroticism' -> level='low', trait='neoNeuroticism'
df[['level','trait']] = df['trait_description'].str.split('_', n=1, expand=True)
df = df.sort_values(by=['trait', 'level'])[['level', 'trait', 'gpt_reasoning']]

df.to_csv(f'{DATA_ROOT}/aapecs_gpt_41_reasoning.csv', index=False)

In [37]:
# Table of the evaluated participants: one row per (participant, level, trait).
df = pd.DataFrame(outputs)
df.columns = ['participantID', 'trait_description', 'self_reported', 'gpt_predicted', 'gpt_reasoning']

# 'low_neoNeuroticism' -> level='low', trait='neoNeuroticism'
level_and_trait = df['trait_description'].str.split('_', n=1, expand=True)
df[['level','trait']] = level_and_trait
df = df.sort_values(by=['trait', 'level'])

# Overwrite 'self_reported' with the value from scales_df: for each row, take
# the first match for that participant in the column named after the trait.
true_scales = [
    scales_df.loc[scales_df.participantID == participant, trait_name].values.flatten()[0]
    for participant, trait_name in zip(df.participantID.values, df.trait.values)
]
df['self_reported'] = true_scales
df

Unnamed: 0,participantID,trait_description,self_reported,gpt_predicted,gpt_reasoning,level,trait
19,125.0,high_neoAgreeableness,3.083333,3.625,Justifications:\n\nReason 1\nStrong concern an...,high,neoAgreeableness
21,17.0,high_neoAgreeableness,3.291667,3.541667,Justifications:\n\nReason 1\nStrong concern fo...,high,neoAgreeableness
23,50.0,high_neoAgreeableness,3.708333,3.625,Justifications:\n\nReason 1\nStrong concern fo...,high,neoAgreeableness
18,67.0,low_neoAgreeableness,2.208333,2.0,Justifications:\n\nReason 1\nThe diaries provi...,low,neoAgreeableness
20,10.0,low_neoAgreeableness,3.375,2.458333,Justifications:\n\nReason 1\nStrong emotional ...,low,neoAgreeableness
22,51.0,low_neoAgreeableness,2.25,2.0,Justifications:\n\nReason 1\nImpulsivity and d...,low,neoAgreeableness
7,111.0,high_neoConscientiousness,2.75,3.666667,Justifications:\n\nReason 1\nConsistent eviden...,high,neoConscientiousness
9,87.0,high_neoConscientiousness,3.291667,3.833333,Justifications:\n\nReason 1\nStrong sense of r...,high,neoConscientiousness
11,35.0,high_neoConscientiousness,3.666667,3.833333,Justifications:\n\nReason 1\nStrong sense of r...,high,neoConscientiousness
6,105.0,low_neoConscientiousness,0.75,1.5,Justifications:\n\nReason 1\nChronic procrasti...,low,neoConscientiousness
