In [1]:
from IPython.display import display, HTML
# Inject CSS that widens the notebook's `.container` element to 95% of the page.
display(HTML("<style>.container { width:95% !important; }</style>"))

def set_css_in_cell_output(*_args, **_kwargs):
    """
    Emit a <style> block that recolours ipywidgets text in the cell output.

    The function is registered as a `pre_run_cell` callback. IPython passes an
    execution-info argument to that event, so any positional/keyword arguments
    are accepted and ignored; calling it with no arguments still works.
    """
    widget_css = '''
        <style>
            .jupyter-widgets {color: #d5d5d5 !important;}
            .widget-label {color: #d5d5d5 !important;}
        </style>
    '''
    display(HTML(widget_css))

# Run set_css_in_cell_output before every cell execution.
# NOTE(review): re-running this cell registers the callback again; confirm that is harmless.
get_ipython().events.register('pre_run_cell', set_css_in_cell_output)
from IPython.core import ultratb
# Overrides a private VerboseTB attribute; presumably the highlight colour used in
# tracebacks — TODO confirm it still exists in the installed IPython version.
ultratb.VerboseTB._tb_highlight = "bg:#0D0D0D"

In [2]:
import pickle
import time
import tiktoken
import datetime
import json
import requests
import traceback
import re
import numpy as np
from tqdm.notebook import tqdm
from jsonschema import validate
from openai import OpenAI, RateLimitError, APITimeoutError, InternalServerError, Timeout
from tenacity import retry, stop_after_attempt, wait_incrementing, retry_if_exception_type, after_log, before_sleep_log
import logging
import mysql.connector
from mysql.connector import Error
from IPython.display import clear_output

In [None]:
import os

# Private OpenRouter API key. It is read from the OPENROUTER_API_KEY environment
# variable so the secret never has to be saved inside the notebook; when the
# variable is unset the value falls back to the previous empty placeholder.
OPENROUTER_API_KEY = os.environ.get('OPENROUTER_API_KEY', '')        # private openrouter api key

In [4]:
# client-side rate limits: requests per minute, tokens per minute
request_limit_per_minute, token_limit_per_minute = 500, 2e6

# maximum wait (seconds) for the API to respond before a request timeout is triggered
request_timeout_seconds = 120
# number of times the client automatically retries a failed request
request_max_retries = 1
# when the internal TPM estimate says the limit is exceeded, how often (seconds) to re-check it
tpm_wait_polling_seconds = 10

# global logger for static classes (the root logger, emitting at INFO level)
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger()

In [6]:
class ChatGPT:
    """
    Small OpenRouter chat client (built on the `openai` SDK) that sends one
    prompt per call and maps the model's reply onto a fixed set of Likert options.

    Every raw response is kept in `response_history` and every message list that
    was sent is kept in `message_history` (keyed by an integer message id), so
    running cost and the exact prompts can be inspected afterwards.
    """

    def __init__(self, model_provider_order,
                 halt_on_error=True,
                 is_verbose=True,
                 timeout=request_timeout_seconds,
                 max_retries=request_max_retries,
                 request_limit_per_minute=request_limit_per_minute,
                 token_limit_per_minute=token_limit_per_minute,
                 tpm_wait_polling_seconds=tpm_wait_polling_seconds,
                 logger=logger,
                 api_key=OPENROUTER_API_KEY,
                 limit_manager_db_password=None):
        """
        Args:
            model_provider_order: 3-tuple `(model, provider_order, canonical_name)`.
            halt_on_error: when True failures raise; when False they are logged
                (if `is_verbose`) and the failing call returns None.
            is_verbose: emit informational log lines.
            timeout: request timeout handed to the OpenAI client.
            max_retries: automatic retry count handed to the OpenAI client.
            request_limit_per_minute: used to derive a fixed delay before each request.
            token_limit_per_minute: upper bound used by the TPM checks.
            tpm_wait_polling_seconds: sleep between TPM re-checks.
            logger: logger used for all messages of this instance.
            api_key: OpenRouter API key.
            limit_manager_db_password: stored on the instance only. When omitted,
                a module-level `LIMIT_MANAGER_DB_PASSWORD` is used if one exists,
                otherwise None. (The previous default referenced that global
                directly and raised NameError when it was not defined.)
        """
        self.model, self.provider_order, self.model_canonical_name = model_provider_order
        self.halt_on_error = halt_on_error
        self.is_verbose = is_verbose
        self.tpm_wait_polling_seconds = tpm_wait_polling_seconds
        self.request_limit_per_minute = request_limit_per_minute
        self.request_delay_seconds = 60.0 / request_limit_per_minute
        self.token_limit_per_minute = token_limit_per_minute
        self.response_history = []
        self.message_history = {}
        # (n_tokens, timestamp) pairs consumed by check_internal_TPM_tracker
        self.token_count_history = []
        self.logger = logger
        if limit_manager_db_password is None:
            limit_manager_db_password = globals().get('LIMIT_MANAGER_DB_PASSWORD')
        self.limit_manager_db_password = limit_manager_db_password
        # plain-number timeout reused for the `requests` call in get_key_usage_credits;
        # None (no timeout, the previous behaviour) if `timeout` is not a number
        self.http_timeout_seconds = timeout if isinstance(timeout, (int, float)) else None
        likert_options = [
            "Very Inaccurate",
            "Moderately Inaccurate",
            "Neither Accurate nor Inaccurate",
            "Moderately Accurate",
            "Very Accurate",
        ]
        # Sort by length so longer options are tried before shorter ones
        self.likert_options = sorted(likert_options, key=len, reverse=True)
        self.default_seed = 1 if 'x-ai' in self.model else 0

        self.client = OpenAI(base_url="https://openrouter.ai/api/v1",
                             api_key=api_key,
                             timeout=timeout,
                             max_retries=max_retries)

    def extract_likert_response(self, content):
        """
        Find the first Likert option (longest first) that appears in `content`
        as a whole phrase, case-insensitively.

        Returns:
            JSON string of the form `{"response": "<option>"}`.
        Raises:
            Exception: if none of the options occurs in `content`.
        """
        content_lower = content.lower()
        for option in self.likert_options:
            pattern = r'\b' + re.escape(option.lower()) + r'\b'
            if re.search(pattern, content_lower):
                return json.dumps({"response": option})
        raise Exception(f"No Likert match found in: {content}")

    def get_running_cost_num_prompt_completion_tokens(self):
        """
        This function computes the total cost (estimated) of all
        messages sent by the instance of ChatGPT called from
        Returns: total_running_cost, total_num_prompt_tokens, total_num_response_tokens
        """
        n_prompt_tokens = np.sum([x.usage.prompt_tokens for x in self.response_history])
        n_completion_tokens = np.sum([x.usage.completion_tokens for x in self.response_history])
        # a response without a reported cost counts as 0 instead of aborting the total
        total_cost = sum((getattr(r.usage, 'cost', None) or 0) for r in self.response_history)
        return (total_cost,
                n_prompt_tokens,
                n_completion_tokens)

    def get_key_usage_credits(self):
        """
        Ask OpenRouter what the API key has used in total.

        Returns:
            (total_usage, total_credits) as reported by the credits endpoint.
        Raises:
            requests.HTTPError: on a non-success HTTP status.
        """
        resp = requests.get(
            "https://openrouter.ai/api/v1/credits",
            headers={"Authorization": f"Bearer {self.client.api_key}"},
            timeout=self.http_timeout_seconds,  # avoid hanging forever on a stalled connection
        )
        resp.raise_for_status()
        info = resp.json()["data"]
        return info["total_usage"], info["total_credits"]

    # retry failing requests starting with 10 second wait,
    # increasing wait time by 10 seconds each retry, up to a max window of 120s (or 5 times)
    # the goal is to try to avoid hitting backoff,
    # we treat this as a last resort because of its runtime cost
    @retry(wait=wait_incrementing(start=10, increment=10, max=120),
           stop=stop_after_attempt(5),
           retry=retry_if_exception_type((RateLimitError, APITimeoutError, InternalServerError, Timeout)),
           before_sleep=before_sleep_log(logger, logging.INFO),
           after=after_log(logger, logging.INFO))
    def completion_with_backoff(self, client, **kwargs):
        """Forward `kwargs` to `client.chat.completions.create` under the retry policy above."""
        return client.chat.completions.create(**kwargs)

    def _tokens_sent_in_last_minute(self):
        """Drop bookkeeping entries older than 60 seconds and return the token total of the rest."""
        one_minute_ago = datetime.datetime.now() - datetime.timedelta(seconds=60)
        self.token_count_history = [x for x in self.token_count_history if x[1] > one_minute_ago]
        return sum(x[0] for x in self.token_count_history)

    def check_internal_TPM_tracker(self, n_message_tokens):
        """
        Checks internal TPM count to see if a message with length = n_message_tokens
        can be sent. If not, it waits (sleeps - blocking) until sending the message
        fits into the TPM limit, then records the message in the internal history.

        Raises:
            ValueError: if the message alone exceeds the TPM limit (waiting could
                never help, so the previous behaviour was an endless loop).
        """
        if n_message_tokens > self.token_limit_per_minute:
            raise ValueError(f'Message with {n_message_tokens} tokens can never fit into the '
                             f'TPM limit of {self.token_limit_per_minute}')
        # fixed delay waiting if TPM exceeded over past minute
        # this is cpu polling, so it doesnt cost money or much compute
        while self._tokens_sent_in_last_minute() + n_message_tokens > self.token_limit_per_minute:
            if self.is_verbose:
                self.logger.info(f'Internal TPM limit exceeded, waiting for {self.tpm_wait_polling_seconds} seconds...')
            time.sleep(self.tpm_wait_polling_seconds)
        self.token_count_history.append((n_message_tokens, datetime.datetime.now()))

    def send_message(self, system_role, message, validate_response=True):
        """
        This is the primary function used to send messages to GPT and get responses
        Steps are:
          - sleep a fixed delay derived from the RPM limit
          - reject messages whose estimated token count alone exceeds the TPM limit
          - build and send the message using the openai ChatCompletion api
          - map the reply onto one of the Likert options

        Args:
            system_role: system prompt (not sent for 'x-ai' models).
            message: user prompt.
            validate_response: accepted for interface compatibility; not used here.

        Returns:
            `(response, message_id)` on success, where `response.choices[0].message.content`
            has been replaced by a JSON string `{"response": "<option>"}`.
            None on failure when `halt_on_error` is False.
        Raises:
            Exception: any failure when `halt_on_error` is True.
        """
        # sleep based on RPM limit (lazy logic, avoids keeping running count of actual requests per minute)
        time.sleep(self.request_delay_seconds)

        # estimate the token count; models unknown to tiktoken fall back to the gpt-4 encoding
        try:
            encoding = tiktoken.encoding_for_model(self.model)
        except Exception:
            encoding = tiktoken.encoding_for_model('gpt-4')
        n_message_tokens = len(encoding.encode(system_role)) + len(encoding.encode(message))
        self.logger.info(f'processing message with {n_message_tokens} tokens...')
        # only the single-message size is checked here; the running per-minute
        # count in check_internal_TPM_tracker is not consulted by this method
        if n_message_tokens > self.token_limit_per_minute:
            return self.bad_response_output(f'Unable to send message as it exceeds TPM. Number of tokens in message = {n_message_tokens}')

        # build and send message over openai-api
        message_id = len(self.message_history)
        self.message_history[message_id] = [] if 'x-ai' in self.model else [{"role": "system", "content": system_role}]
        self.message_history[message_id].append({"role": "user", "content": message})
        try:
            response = self.completion_with_backoff(self.client,
                                                    model=self.model,
                                                    messages=self.message_history[message_id],
                                                    temperature=0,
                                                    stream=False,
                                                    extra_body={"usage": {"include": True},
                                                                "reasoning": {# One of the following (not both):
                                                                              "effort": "medium", # Can be "high", "medium", or "low" (OpenAI-style)
                                                                              # Optional: Default is false. All models support this.
                                                                              "exclude": False # Set to true to exclude reasoning tokens from response
                                                                              },
                                                                "provider": {"order": self.provider_order,
                                                                             "sort": "price",
                                                                             "data_collection": "deny",
                                                                             "allow_fallbacks": False}},
                                                    seed=self.default_seed, logprobs=False)

            self.response_history.append(response)
            reply = response.choices[0].message
            if reply.content is None:
                self.bad_response_output('None in message content')
                return None
            if reply.content == '':
                # some reasoning models leave `content` empty; fall back to the reasoning text
                reasoning = getattr(reply, 'reasoning', '')
                if reasoning:
                    reply.content = reasoning
            try:
                likert_response = self.extract_likert_response(reply.content.strip().lower())
                reply.content = likert_response
            except Exception:
                self.bad_response_output('GPT response didnt match likert options')
                return None
        except Exception as e:
            if self.halt_on_error:
                raise
            else:
                if self.is_verbose:
                    str_e = str(e)
                    self.logger.info(f'An exception occurred: {str_e}')
                    self.logger.info(traceback.format_exc())
                return None

        return (response, message_id)

    def bad_response_output(self, error):
        """
        General function for informing the user when an error occurs.

        Raises `Exception(error)` when `halt_on_error` is set; otherwise logs the
        error (if verbose) and returns None.
        """
        if self.halt_on_error:
            raise Exception(error)
        else:
            if self.is_verbose:
                self.logger.info(f'Error - {error}')
        return None

In [7]:
# Each entry is (OpenRouter model id, provider order, canonical short name),
# the 3-tuple that ChatGPT.__init__ unpacks from `model_provider_order`.
llama_4_maverick = (
    'meta-llama/llama-4-maverick',
    ['Fireworks', 'Together', 'Kluster'],
    'llama_maverick',
)
gemini_flash = (
    'google/gemini-2.5-flash-preview-05-20:thinking',
    ['Google', 'Google AI Studio'],
    'gemini_flash',
)
qwen_235B = (
    'qwen/qwen3-235b-a22b',
    ['DeepInfra', 'Kluster', 'Parasail', 'Together', 'Nebius'],
    'qwen3_235B',
)
gpt_41 = (
    'openai/gpt-4.1',
    ['OpenAI'],
    'gpt_41',
)
gpt_41_mini = (
    'openai/gpt-4.1-mini',
    ['OpenAI'],
    'gpt_41_mini',
)
claude_sonnet = (
    'anthropic/claude-3.7-sonnet',
    ['Anthropic', 'Amazon Bedrock', 'Google', 'Google AI Studio'],
    'claude_sonnet',
)
grok_3 = (
    'x-ai/grok-3-beta',
    ['xAI'],
    'grok3_beta',
)

# Example: We expect a simple string response from GPT

In [None]:
# Model/provider tuple used by every ChatGPT instance created further down;
# swap in one of the other tuples defined in the previous cell to evaluate a different model.
MODEL_TO_EVALUATE = gemini_flash

In [8]:
def example_messaging_wrapper(chat, system_role, message):
    """
    Send one message through `chat`, print what was sent, the parsed answer and
    the running cost, and hand back the raw result.

    Any exception raised while sending or parsing is logged via `chat.logger`
    and its traceback is printed; it is not re-raised.

    Returns:
        (response, message_history_id); (None, -1) if sending failed before
        the result could be unpacked.
    """
    response = None
    message_history_id = -1
    response_str = ''
    try:
        response, message_history_id = chat.send_message(
            system_role=system_role,
            message=message,
            validate_response=True,
        )
        assert response is not None
        parsed_reply = json.loads(response.choices[0].message.content)
        response_str = parsed_reply["response"]
    except Exception as e:
        # failures leave response_str empty and fall through to the summary below
        chat.logger.info(f'Messaging wrapper failure - {str(e)}')
        print(traceback.format_exc())

    cost, _n_prompt, _n_completion = chat.get_running_cost_num_prompt_completion_tokens()

    if message_history_id in chat.message_history:
        last_messages_sent_to_gpt = chat.message_history[message_history_id]
    else:
        last_messages_sent_to_gpt = ''

    print(f'Messages to GPT:\n{last_messages_sent_to_gpt}')
    print(f'Response from GPT:\n{response_str}')
    print(f'Cost: ${cost:.5f}')

    return response, message_history_id

In [9]:
# Smoke test of the full request path: specify system role and user message,
# then send them once through a fresh ChatGPT instance.
system_role = 'you are a helpful assistant.'
# NOTE(review): this agree/disagree scale (and likert_options_str) is not referenced
# by the prompt below, which uses the accurate/inaccurate scale — confirm it is still needed.
likert_options = [
    "Strongly Disagree",
    "Strongly Agree",
    "Disagree",
    "Neutral",
    "Agree",
]
likert_options_str = '\n'.join(likert_options)

# Plain (non-f) string: the {question} and {thoughts} placeholders are sent literally,
# which matches the "simulate your response" instruction on the first line of the prompt.
message = """
Simulate your response to the prompt below without being provided the question and thoughts. This is for testing purposes.

---

Your task is to respond to the following IPIP-NEO-120 question based on the participant's daily diaries of the most significant event that occurred during the day, provided below. Respond as though you are the individual who generated these thoughts, reflecting their personality traits.
Base your answer on inferred personality traits. Think carefully about what the thoughts imply about tendencies and behaviors.

For each question, select the most appropriate option:
- Very Inaccurate: The statement is definitely false or the participant would strongly disagree with it.
- Moderately Inaccurate: The statement is mostly false or the participant would generally disagree with it.
- Neither Accurate nor Inaccurate: The participant would be neutral on the statement, cannot decide, or find the statement equally true and false.
- Moderately Accurate: The statement is mostly true or the participant would generally agree with it.
- Very Accurate: The statement is definitely true or the participant would strongly agree with it.

IPIP-NEO-120 question to answer:
{question}

Participant's daily diaries:
{thoughts}

Your response must be exactly one of:
Very Inaccurate
Moderately Inaccurate
Neither Accurate nor Inaccurate
Moderately Accurate
Very Accurate

Do not include any explanation, punctuation, or additional text. Return only the exact phrase from the list above.
"""

# create a single instance of ChatGPT 
# so that we can keep track of running costs
chat = ChatGPT(model_provider_order=MODEL_TO_EVALUATE)

# sends one real request to the API
response, message_history_id = example_messaging_wrapper(chat, system_role, message)

INFO:root:processing message with 307 tokens...
INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"


Messages to GPT:
[{'role': 'system', 'content': 'you are a helpful assistant.'}, {'role': 'user', 'content': "\nSimulate your response to the prompt below without being provided the question and thoughts. This is for testing purposes.\n\n---\n\nYour task is to respond to the following IPIP-NEO-120 question based on the participant's daily diaries of the most significant event that occurred during the day, provided below. Respond as though you are the individual who generated these thoughts, reflecting their personality traits.\nBase your answer on inferred personality traits. Think carefully about what the thoughts imply about tendencies and behaviors.\n\nFor each question, select the most appropriate option:\n- Very Inaccurate: The statement is definitely false or the participant would strongly disagree with it.\n- Moderately Inaccurate: The statement is mostly false or the participant would generally disagree with it.\n- Neither Accurate nor Inaccurate: The participant would be neutr

# AAPECS

In [4]:
import pandas as pd
from os import listdir
from os.path import isfile, join
import time
from scipy import stats

# show every column when a DataFrame is displayed (no column truncation)
pd.set_option('display.max_columns', None)

In [5]:
# Questionnaire items, one dict per item (120 entries), in administration order.
#   'text'   : the item wording shown to the model
#   'keyed'  : 'plus' or 'minus' — presumably the scoring direction (minus = reverse-scored); TODO confirm
#   'domain' : one of 'N', 'E', 'O', 'A', 'C' — presumably the Big Five domain the item belongs to
# The domains cycle N, E, O, A, C through the list.
questions_list = [
    {'text': 'Worry about things', 'keyed': 'plus', 'domain': 'N'},
    {'text': 'Make friends easily', 'keyed': 'plus', 'domain': 'E'},
    {'text': 'Have a vivid imagination', 'keyed': 'plus', 'domain': 'O'},
    {'text': 'Trust others', 'keyed': 'plus', 'domain': 'A'},
    {'text': 'Complete tasks successfully', 'keyed': 'plus', 'domain': 'C'},
    {'text': 'Get angry easily', 'keyed': 'plus', 'domain': 'N'},
    {'text': 'Love large parties', 'keyed': 'plus', 'domain': 'E'},
    {'text': 'Believe in the importance of art', 'keyed': 'plus', 'domain': 'O'},
    {'text': 'Use others for my own ends', 'keyed': 'minus', 'domain': 'A'},
    {'text': 'Like to tidy up', 'keyed': 'plus', 'domain': 'C'},
    {'text': 'Often feel blue', 'keyed': 'plus', 'domain': 'N'},
    {'text': 'Take charge', 'keyed': 'plus', 'domain': 'E'},
    {'text': 'Experience my emotions intensely', 'keyed': 'plus', 'domain': 'O'},
    {'text': 'Love to help others', 'keyed': 'plus', 'domain': 'A'},
    {'text': 'Keep my promises', 'keyed': 'plus', 'domain': 'C'},
    {'text': 'Find it difficult to approach others', 'keyed': 'plus', 'domain': 'N'},
    {'text': 'Am always busy', 'keyed': 'plus', 'domain': 'E'},
    {'text': 'Prefer variety to routine', 'keyed': 'plus', 'domain': 'O'},
    {'text': 'Love a good fight', 'keyed': 'minus', 'domain': 'A'},
    {'text': 'Work hard', 'keyed': 'plus', 'domain': 'C'},
    {'text': 'Go on binges', 'keyed': 'plus', 'domain': 'N'},
    {'text': 'Love excitement', 'keyed': 'plus', 'domain': 'E'},
    {'text': 'Love to read challenging material', 'keyed': 'plus', 'domain': 'O'},
    {'text': 'Believe that I am better than others', 'keyed': 'minus', 'domain': 'A'},
    {'text': 'Am always prepared', 'keyed': 'plus', 'domain': 'C'},
    {'text': 'Panic easily', 'keyed': 'plus', 'domain': 'N'},
    {'text': 'Radiate joy', 'keyed': 'plus', 'domain': 'E'},
    {'text': 'Tend to vote for liberal political candidates', 'keyed': 'plus', 'domain': 'O'},
    {'text': 'Sympathize with the homeless', 'keyed': 'plus', 'domain': 'A'},
    {'text': 'Jump into things without thinking', 'keyed': 'minus', 'domain': 'C'},
    {'text': 'Fear for the worst', 'keyed': 'plus', 'domain': 'N'},
    {'text': 'Feel comfortable around people', 'keyed': 'plus', 'domain': 'E'},
    {'text': 'Enjoy wild flights of fantasy', 'keyed': 'plus', 'domain': 'O'},
    {'text': 'Believe that others have good intentions', 'keyed': 'plus', 'domain': 'A'},
    {'text': 'Excel in what I do', 'keyed': 'plus', 'domain': 'C'},
    {'text': 'Get irritated easily', 'keyed': 'plus', 'domain': 'N'},
    {'text': 'Talk to a lot of different people at parties', 'keyed': 'plus', 'domain': 'E'},
    {'text': 'See beauty in things that others might not notice', 'keyed': 'plus', 'domain': 'O'},
    {'text': 'Cheat to get ahead', 'keyed': 'minus', 'domain': 'A'},
    {'text': 'Often forget to put things back in their proper place', 'keyed': 'minus', 'domain': 'C'},
    {'text': 'Dislike myself', 'keyed': 'plus', 'domain': 'N'},
    {'text': 'Try to lead others', 'keyed': 'plus', 'domain': 'E'},
    {'text': 'Feel others\' emotions', 'keyed': 'plus', 'domain': 'O'},
    {'text': 'Am concerned about others', 'keyed': 'plus', 'domain': 'A'},
    {'text': 'Tell the truth', 'keyed': 'plus', 'domain': 'C'},
    {'text': 'Am afraid to draw attention to myself', 'keyed': 'plus', 'domain': 'N'},
    {'text': 'Am always on the go', 'keyed': 'plus', 'domain': 'E'},
    {'text': 'Prefer to stick with things that I know', 'keyed': 'minus', 'domain': 'O'},
    {'text': 'Yell at people', 'keyed': 'minus', 'domain': 'A'},
    {'text': 'Do more than what\'s expected of me', 'keyed': 'plus', 'domain': 'C'},
    {'text': 'Rarely overindulge', 'keyed': 'minus', 'domain': 'N'},
    {'text': 'Seek adventure', 'keyed': 'plus', 'domain': 'E'},
    {'text': 'Avoid philosophical discussions', 'keyed': 'minus', 'domain': 'O'},
    {'text': 'Think highly of myself', 'keyed': 'minus', 'domain': 'A'},
    {'text': 'Carry out my plans', 'keyed': 'plus', 'domain': 'C'},
    {'text': 'Become overwhelmed by events', 'keyed': 'plus', 'domain': 'N'},
    {'text': 'Have a lot of fun', 'keyed': 'plus', 'domain': 'E'},
    {'text': 'Believe that there is no absolute right and wrong', 'keyed': 'plus', 'domain': 'O'},
    {'text': 'Feel sympathy for those who are worse off than myself', 'keyed': 'plus', 'domain': 'A'},
    {'text': 'Make rash decisions', 'keyed': 'minus', 'domain': 'C'},
    {'text': 'Am afraid of many things', 'keyed': 'plus', 'domain': 'N'},
    {'text': 'Avoid contacts with others', 'keyed': 'minus', 'domain': 'E'},
    {'text': 'Love to daydream', 'keyed': 'plus', 'domain': 'O'},
    {'text': 'Trust what people say', 'keyed': 'plus', 'domain': 'A'},
    {'text': 'Handle tasks smoothly', 'keyed': 'plus', 'domain': 'C'},
    {'text': 'Lose my temper', 'keyed': 'plus', 'domain': 'N'},
    {'text': 'Prefer to be alone', 'keyed': 'minus', 'domain': 'E'},
    {'text': 'Do not like poetry', 'keyed': 'minus', 'domain': 'O'},
    {'text': 'Take advantage of others', 'keyed': 'minus', 'domain': 'A'},
    {'text': 'Leave a mess in my room', 'keyed': 'minus', 'domain': 'C'},
    {'text': 'Am often down in the dumps', 'keyed': 'plus', 'domain': 'N'},
    {'text': 'Take control of things', 'keyed': 'plus', 'domain': 'E'},
    {'text': 'Rarely notice my emotional reactions', 'keyed': 'minus', 'domain': 'O'},
    {'text': 'Am indifferent to the feelings of others', 'keyed': 'minus', 'domain': 'A'},
    {'text': 'Break rules', 'keyed': 'minus', 'domain': 'C'},
    {'text': 'Only feel comfortable with friends', 'keyed': 'plus', 'domain': 'N'},
    {'text': 'Do a lot in my spare time', 'keyed': 'plus', 'domain': 'E'},
    {'text': 'Dislike changes', 'keyed': 'minus', 'domain': 'O'},
    {'text': 'Insult people', 'keyed': 'minus', 'domain': 'A'},
    {'text': 'Do just enough work to get by', 'keyed': 'minus', 'domain': 'C'},
    {'text': 'Easily resist temptations', 'keyed': 'minus', 'domain': 'N'},
    {'text': 'Enjoy being reckless', 'keyed': 'plus', 'domain': 'E'},
    {'text': 'Have difficulty understanding abstract ideas', 'keyed': 'minus', 'domain': 'O'},
    {'text': 'Have a high opinion of myself', 'keyed': 'minus', 'domain': 'A'},
    {'text': 'Waste my time', 'keyed': 'minus', 'domain': 'C'},
    {'text': "Feel that I'm unable to deal with things", 'keyed': 'plus', 'domain': 'N'},
    {'text': 'Love life', 'keyed': 'plus', 'domain': 'E'},
    {'text': 'Tend to vote for conservative political candidates', 'keyed': 'minus', 'domain': 'O'},
    {'text': "Am not interested in other people's problems", 'keyed': 'minus', 'domain': 'A'},
    {'text': 'Rush into things', 'keyed': 'minus', 'domain': 'C'},
    {'text': 'Get stressed out easily', 'keyed': 'plus', 'domain': 'N'},
    {'text': 'Keep others at a distance', 'keyed': 'minus', 'domain': 'E'},
    {'text': 'Like to get lost in thought', 'keyed': 'plus', 'domain': 'O'},
    {'text': 'Distrust people', 'keyed': 'minus', 'domain': 'A'},
    {'text': 'Know how to get things done', 'keyed': 'plus', 'domain': 'C'},
    {'text': 'Am not easily annoyed', 'keyed': 'minus', 'domain': 'N'},
    {'text': 'Avoid crowds', 'keyed': 'minus', 'domain': 'E'},
    {'text': 'Do not enjoy going to art museums', 'keyed': 'minus', 'domain': 'O'},
    {'text': "Obstruct others' plans", 'keyed': 'minus', 'domain': 'A'},
    {'text': 'Leave my belongings around', 'keyed': 'minus', 'domain': 'C'},
    {'text': 'Feel comfortable with myself', 'keyed': 'minus', 'domain': 'N'},
    {'text': 'Wait for others to lead the way', 'keyed': 'minus', 'domain': 'E'},
    {'text': "Don't understand people who get emotional", 'keyed': 'minus', 'domain': 'O'},
    {'text': 'Take no time for others', 'keyed': 'minus', 'domain': 'A'},
    {'text': 'Break my promises', 'keyed': 'minus', 'domain': 'C'},
    {'text': 'Am not bothered by difficult social situations', 'keyed': 'minus', 'domain': 'N'},
    {'text': 'Like to take it easy', 'keyed': 'minus', 'domain': 'E'},
    {'text': 'Am attached to conventional ways', 'keyed': 'minus', 'domain': 'O'},
    {'text': 'Get back at others', 'keyed': 'minus', 'domain': 'A'},
    {'text': 'Put little time and effort into my work', 'keyed': 'minus', 'domain': 'C'},
    {'text': 'Am able to control my cravings', 'keyed': 'minus', 'domain': 'N'},
    {'text': 'Act wild and crazy', 'keyed': 'plus', 'domain': 'E'},
    {'text': 'Am not interested in theoretical discussions', 'keyed': 'minus', 'domain': 'O'},
    {'text': 'Boast about my virtues', 'keyed': 'minus', 'domain': 'A'},
    {'text': 'Have difficulty starting tasks', 'keyed': 'minus', 'domain': 'C'},
    {'text': 'Remain calm under pressure', 'keyed': 'minus', 'domain': 'N'},
    {'text': 'Look at the bright side of life', 'keyed': 'plus', 'domain': 'E'},
    {'text': 'Believe that we should be tough on crime', 'keyed': 'minus', 'domain': 'O'},
    {'text': 'Try not to think about the needy', 'keyed': 'minus', 'domain': 'A'},
    {'text': 'Act without thinking', 'keyed': 'minus', 'domain': 'C'}
]

# Prefix every item with its 1-based position, e.g. "Question 1: Worry about things".
for question_number, question_entry in enumerate(questions_list, start=1):
    question_entry['text'] = f'Question {question_number}: ' + question_entry['text']

In [None]:
# Root directory of the AAPECS data; the loading cell below reads
# eod_new_time.csv, raw_video_logs/ and transcripts_json/ from under it.
DATA_ROOT = './data/AAPECS/'  # please contact the authors for access to the data

In [42]:
# Load the phenotype table and collect, per participant, the transcript text and
# recording length of every end-of-day video that has a transcript JSON.
pheno_df = pd.read_csv(f'{DATA_ROOT}/eod_new_time.csv')
data_root = f'{DATA_ROOT}/raw_video_logs'
# listdir order is arbitrary; sort so the processing order is reproducible
data_files = sorted(f for f in listdir(data_root) if isfile(join(data_root, f)))

sub_transcripts = {}   # participant number -> list of transcript texts
sub_lengths = {}       # participant number -> list of recording lengths (end of the last segment)

for file in tqdm(data_files):
    df = pd.read_csv(f'{data_root}/{file}')
    # the last column holds the video address; drop rows that have no recording
    df = df[(df.values[:, -1] != 'NO_ANSWER') & (df.values[:, -1] != 'SKIPPED')]

    dates = [x.replace('/', '_') for x in df['Survey Submitted Date'].values]  # dd_mm_yyyy
    times = [x.replace(':', '-') for x in df['Survey Submitted Time'].values]  # ':' -> '-' to match the transcript file names
    addresses = df.values[:, -1]
    userid = df['User Id'].values
    # the participant number is whatever remains of the file name once the known affixes are removed
    usernum = file.lower().replace('eod', '').replace('vids', '').replace('videos', '').replace('.csv', '').replace('video', '')
    usernum = int(usernum)
    triggers = [x.replace(' ', '') for x in df['Trigger Type'].values]

    assert np.all(['http' in x for x in addresses])
    assert np.all((df['Trigger Type'].values == "DAILY") | (df['Trigger Type'].values == "DELETED TRIGGER") | (df['Trigger Type'].values == "ONCE"))
    # every address must point at an .mp4 file (the previous form of this check
    # passed a generator to np.all and compared a list to 'mp4', so it never failed)
    assert all(x.split('/')[-1].split('.')[-1] == 'mp4' for x in addresses), f'non-mp4 video address found in {file}'

    # skip participants without phenotype data
    sub_pheno_df = pheno_df[pheno_df.participantID == usernum]
    if sub_pheno_df.shape[0] == 0: continue

    for i in range(len(addresses)):
        transcript_path = f'{DATA_ROOT}/transcripts_json/{usernum}/{usernum}_{dates[i]}_{times[i]}_{triggers[i]}_{userid[i]}.json'
        # read each transcript once and close the file handle afterwards
        with open(transcript_path) as transcript_file:
            transcript = json.load(transcript_file)

        txt = transcript['text'].strip()
        recording_length = transcript['segments'][-1]['end']

        sub_transcripts.setdefault(usernum, []).append(txt)
        sub_lengths.setdefault(usernum, []).append(recording_length)

A Jupyter Widget

In [18]:
def get_NEO_FFI_prompt(thoughts, question):
    """
    Build the user prompt asking the model to answer one IPIP-NEO-120 item
    from a participant's diary text.

    Args:
        thoughts: diary text inserted under "Participant's daily diaries:".
        question: item text inserted under "IPIP-NEO-120 question to answer:".

    Returns:
        The prompt string: task instructions, the question and diaries, then the
        list of allowed answers.
    """
    task_instructions = """
Your task is to respond to the following IPIP-NEO-120 question based on the participant's daily diaries of the most significant event that occurred during the day, provided below. Respond as though you are the individual who generated these thoughts, reflecting their personality traits.
Base your answer on inferred personality traits. Think carefully about what the thoughts imply about tendencies and behaviors.

For each question, select the most appropriate option:
- Very Inaccurate: The statement is definitely false or the participant would strongly disagree with it.
- Moderately Inaccurate: The statement is mostly false or the participant would generally disagree with it.
- Neither Accurate nor Inaccurate: The participant would be neutral on the statement, cannot decide, or find the statement equally true and false.
- Moderately Accurate: The statement is mostly true or the participant would generally agree with it.
- Very Accurate: The statement is definitely true or the participant would strongly agree with it.
"""
    # only this middle section carries the caller-supplied text
    question_and_diaries = f"""
IPIP-NEO-120 question to answer:
{question}

Participant's daily diaries:
{thoughts}
"""
    answer_format = """
Your response must be exactly one of:
Very Inaccurate
Moderately Inaccurate
Neither Accurate nor Inaccurate
Moderately Accurate
Very Accurate

Do not include any explanation, punctuation, or additional text. Return only the exact phrase from the list above.
"""
    return task_instructions + question_and_diaries + answer_format

In [19]:
def generic_messaging_wrapper(chat, system_role, message):
    """
    Send one message through `chat` and return the parsed answer with the running cost.

    Any exception raised while sending or parsing is logged via `chat.logger`
    and its traceback is printed; it is not re-raised.

    Returns:
        (required_values, cost): `required_values` is the "response" field of the
        JSON reply, or None on failure; `cost` is the first element returned by
        `chat.get_running_cost_num_prompt_completion_tokens()`.
    """
    required_values = None
    try:
        response, _message_history_id = chat.send_message(
            system_role=system_role,
            message=message,
        )
        assert response is not None
        parsed_reply = json.loads(response.choices[0].message.content)
        required_values = parsed_reply['response']
    except Exception as e:
        # leave required_values as None so the caller can detect the failure
        chat.logger.info(f'Messaging wrapper failure - {str(e)}')
        print(traceback.format_exc())

    running_totals = chat.get_running_cost_num_prompt_completion_tokens()
    cost, _n_prompt_tokens, _n_completion_tokens = running_totals
    return required_values, cost

In [20]:
# Replace the example system prompt defined earlier with an empty one;
# multiproc_neo_wrapper reads this module-level value.
system_role = ''
print(system_role)




In [21]:
# One-off check on a single participant before the parallel run: send the first
# 'N'-domain item with that participant's concatenated transcripts (one real API request).
chat = ChatGPT(model_provider_order=MODEL_TO_EVALUATE)

# 54 is a participant number used as a key of sub_transcripts
thoughts = '\n'.join(sub_transcripts[54])
domain_questions = [x['text'] for x in questions_list if x['domain']=='N']
message = get_NEO_FFI_prompt(thoughts, domain_questions[0])
required_values, cost = generic_messaging_wrapper(chat, system_role, message)

print(required_values)

INFO:root:processing message with 3036 tokens...
INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"


Very Accurate


In [24]:
from multiprocessing import Process, Manager

def multiproc_neo_wrapper(thoughts, question_i, name, return_dict, model_to_evaluate):
    """
    Worker entry point: ask one questionnaire item about one transcript and
    store the result in a shared dict.

    Args:
        thoughts: transcript text inserted into the prompt.
        question_i: index into the module-level `questions_list`.
        name: key under which the result is stored in `return_dict`.
        return_dict: mapping that receives `(required_values, cost)`.
        model_to_evaluate: model/provider tuple passed to `ChatGPT`.

    Failures never propagate: the exception is logged and `return_dict` is left
    without an entry for `name`.
    """
    try:
        chat = ChatGPT(model_provider_order=model_to_evaluate)
        dim_question = questions_list[question_i]['text']
        message = get_NEO_FFI_prompt(thoughts, dim_question)
        required_values, cost = generic_messaging_wrapper(chat, system_role, message)
        return_dict[f'{name}'] = (required_values, cost)
    except Exception:
        # still non-fatal, but no longer silent: record which job failed and why
        logging.getLogger().exception(f'multiproc_neo_wrapper failed for {name}')

In [26]:
import resource

# Desired soft limit on open file descriptors for this kernel process.
# NOTE(review): presumably raised because later cells start many worker
# processes at once - confirm.
NOFILE_SOFT_LIMIT_TARGET = 30000

soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
print("Soft limit:", soft, "Hard limit:", hard)

# Raise the soft limit toward the target, but
#   * never above the hard limit (setrlimit rejects soft > hard), and
#   * never lower a soft limit that is already higher or unlimited.
if soft != resource.RLIM_INFINITY:
    if hard == resource.RLIM_INFINITY:
        new_soft = NOFILE_SOFT_LIMIT_TARGET
    else:
        new_soft = min(NOFILE_SOFT_LIMIT_TARGET, hard)
    if new_soft > soft:
        try:
            resource.setrlimit(resource.RLIMIT_NOFILE, (new_soft, hard))
        except (ValueError, OSError) as exc:
            # Best-effort: report and continue with the existing limit.
            print("Could not raise soft limit:", exc)

soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
print("New Soft limit:", soft, "New Hard limit:", hard)

Soft limit: 4096 Hard limit: 262144
New Soft limit: 30000 New Hard limit: 262144


In [27]:
def run_until_resolved(good_th_dict, questions, return_dict, model_to_evaluate,
                       max_attempts=5, sleep_between=0.1, max_concurrent_calls=7500):
    """Run one worker process per (subject, question) pair, retrying until all are answered.

    A request is named ``'{subject}-{question_i}'``. It counts as unresolved
    while that key is absent from the result mapping or its stored answer
    (element 0 of the stored tuple) is ``None``. Each attempt dispatches
    ``multiproc_neo_wrapper`` for every unresolved request, joins the
    processes, and prints a summary. The loop stops as soon as nothing is
    unresolved or after `max_attempts` attempts.

    Args:
        good_th_dict: mapping whose keys are the subjects to process.
        questions: sequence of questions; only its length is used here (the
            worker looks the question up by index itself).
        return_dict: ignored. A fresh ``Manager().dict()`` is always created;
            the parameter is kept only so existing call sites keep working.
        model_to_evaluate: forwarded unchanged to the worker.
        max_attempts: maximum number of dispatch rounds.
        sleep_between: seconds to sleep after starting each process.
        max_concurrent_calls: number of started processes after which the
            batch is joined and a progress report is printed.

    Returns:
        The manager dict proxy mapping request names to the tuples stored by
        the worker (answer first, cost last).
    """
    # The caller-supplied mapping has always been replaced here; callers pass
    # results of a previous model run, which must not be reused.
    return_dict = Manager().dict()

    n_questions = len(questions)
    total_requests = len(good_th_dict) * n_questions
    start_time = time.time()
    transcript_cache = {}   # subject -> joined transcript, built once per subject

    def format_time(seconds):
        mins, secs = divmod(int(seconds), 60)
        hrs, mins = divmod(mins, 60)
        return f"{hrs:02d}:{mins:02d}:{secs:02d}"

    def unresolved_in(snapshot):
        # List (subject, question_i, name) for every request that has no usable answer.
        pending = []
        for subject in good_th_dict:
            for question_i in range(n_questions):
                name = f'{subject}-{question_i}'
                if name not in snapshot or snapshot[name][0] is None:
                    pending.append((subject, question_i, name))
        return pending

    def join_all(procs):
        # Block until every process in the batch has exited.
        for proc in procs:
            proc.join()

    for attempt in range(1, max_attempts + 1):
        # Snapshot the proxy once instead of one manager round-trip per key.
        missing_items = unresolved_in(dict(return_dict))
        if not missing_items:
            break
        print(f"Attempt {attempt}: {len(missing_items)} / {total_requests} requests missing.")

        # Dispatch missing jobs
        procs = []
        for subject, question_i, name in missing_items:
            if subject not in transcript_cache:
                # NOTE(review): transcripts are read from the global
                # `sub_transcripts`, not from `good_th_dict` - confirm intended.
                transcript_cache[subject] = '\n'.join(
                    [x.replace('\n', '') for x in sub_transcripts[subject]])
            proc = Process(target=multiproc_neo_wrapper,
                           args=(transcript_cache[subject], question_i, name,
                                 return_dict, model_to_evaluate))
            proc.start()
            procs.append(proc)
            time.sleep(sleep_between)

            if len(procs) >= max_concurrent_calls:
                join_all(procs)
                procs = []

                # Intermediate update
                completed_items = dict(return_dict)
                running_cost = np.sum([entry[-1] for entry in completed_items.values()])
                n_completed = len(completed_items)
                avg_cost = running_cost / n_completed if n_completed else 0
                estimated_total_cost = avg_cost * total_requests

                elapsed_time = time.time() - start_time
                avg_time_per_call = elapsed_time / n_completed if n_completed else 0
                remaining_calls = total_requests - n_completed
                estimated_time_remaining = avg_time_per_call * remaining_calls

                clear_output(wait=True)
                print(f"Attempt {attempt}")
                print(f"Completed: {n_completed}/{total_requests}")
                print(f"Running cost: ${running_cost:.3f}")
                print(f"Estimated total cost: ${estimated_total_cost:.3f}")
                print(f"Elapsed time: {format_time(elapsed_time)}")
                print(f"Estimated time remaining: {format_time(estimated_time_remaining)}")

        # Final join
        join_all(procs)

        # End-of-attempt summary. A request counts as failed under the same
        # rule the retry check uses (missing key OR None answer).
        completed_dict = dict(return_dict)
        n_failed = len(unresolved_in(completed_dict))
        n_successful = total_requests - n_failed
        n_recorded = len(completed_dict)
        total_cost = np.sum([entry[-1] for entry in completed_dict.values()])
        avg_cost = total_cost / n_recorded if n_recorded else 0
        total_elapsed_time = time.time() - start_time

        clear_output(wait=True)
        print(f"Total responses expected: {total_requests}")
        print(f"Successful: {n_successful}")
        print(f"Failed: {n_failed}")
        print(f"Total cost: ${total_cost:.3f}")
        print(f"Avg cost per response: ${avg_cost:.4f}")
        print(f"Total runtime: {format_time(total_elapsed_time)}")

    # Decide the final message from the actual state, so a run that resolves
    # everything on its last attempt is not reported as having hit the limit.
    n_unresolved = len(unresolved_in(dict(return_dict)))
    if n_unresolved == 0:
        print("All responses successfully completed.")
    else:
        print(f'!!! Max attempts reached. {n_unresolved} requests are still unresolved. !!!')
    return return_dict

In [None]:
def map_responses_to_integers(response_list, reverse_coding_list):
    """Convert Likert-style response labels to integer scores 0-4.

    Labels run from 'Very Inaccurate' (0) to 'Very Accurate' (4). Where the
    matching entry of `reverse_coding_list` is truthy the scale is flipped
    (4 down to 0). A ``None`` response maps to ``np.nan`` in both directions.
    Any other label raises ``KeyError``. The two lists are paired with ``zip``,
    so the result has the length of the shorter one.

    Args:
        response_list: iterable of response labels (or ``None``).
        reverse_coding_list: iterable of flags, one per response.

    Returns:
        List of scores in input order.
    """
    scale_labels = (
        'Very Inaccurate',
        'Moderately Inaccurate',
        'Neither Accurate nor Inaccurate',
        'Moderately Accurate',
        'Very Accurate',
    )
    top_score = len(scale_labels) - 1

    # Forward and flipped look-up tables derived from the ordered labels.
    forward_scores = {label: score for score, label in enumerate(scale_labels)}
    flipped_scores = {label: top_score - score for label, score in forward_scores.items()}
    forward_scores[None] = np.nan
    flipped_scores[None] = np.nan

    mapped_responses = []
    for answer, is_flipped in zip(response_list, reverse_coding_list):
        table = flipped_scores if is_flipped else forward_scores
        mapped_responses.append(table[answer])
    return mapped_responses

In [30]:
# One tuple per model. The whole tuple is handed to run_until_resolved and the
# last element is used as the canonical name in the output file names.
# NOTE(review): the first two elements look like (model id, provider
# preference order) - confirm against the ChatGPT class.
llama_4_maverick = ('meta-llama/llama-4-maverick', ['Fireworks', 'Together', 'Kluster'], 'llama_maverick')
gemini_flash = ('google/gemini-2.5-flash-preview-05-20:thinking', ['Google', 'Google AI Studio'], 'gemini_flash')
qwen_235B = ('qwen/qwen3-235b-a22b', ['DeepInfra', 'Kluster', 'Parasail', 'Together', 'Nebius'], 'qwen3_235B')  
gpt_41 = ('openai/gpt-4.1', ['OpenAI'], 'gpt_41')
gpt_41_mini = ('openai/gpt-4.1-mini', ['OpenAI'], 'gpt_41_mini')
claude_sonnet = ('anthropic/claude-3.7-sonnet', ['Anthropic', 'Amazon Bedrock', 'Google', 'Google AI Studio'], 'claude_sonnet')
grok_3 = ('x-ai/grok-3-beta', ['xAI'], 'grok3_beta')

# Domain code (as found in questions_list[...]['domain']) -> score column name.
name_mapping_aapecs = {
    'O': 'neoOpenness',
    'C': 'neoConscientiousness',
    'E': 'neoExtraversion',
    'A': 'neoAgreeableness',
    'N': 'neoNeuroticism'
}

# Loop-invariant, so it is loaded once - and before any paid API calls, so a
# missing or malformed file fails fast. It is not used inside the loop below.
# NOTE(review): presumably consumed by later cells - confirm.
scales_df = pd.read_csv(f'{DATA_ROOT}/selfReport.csv')
scales_df = scales_df[scales_df.participantID.isin([int(x) for x in sub_lengths])]
scales_df = scales_df[['participantID']+list(cols_of_interest)]


for MODEL_TO_EVALUATE in [llama_4_maverick,
                          qwen_235B,
                          gpt_41,
                          gpt_41_mini,
                          claude_sonnet,
                          grok_3,
                          gemini_flash
                          ]:

    # run_until_resolved always builds its own fresh result mapping and ignores
    # its third argument, so pass None rather than a `return_dict` that does
    # not exist yet on a fresh kernel.
    return_dict = run_until_resolved(sub_transcripts, questions_list, None, MODEL_TO_EVALUATE)
    # Local snapshot: avoids one manager round-trip per lookup below.
    results = dict(return_dict)

    n_subs = len(sub_transcripts)

    # Every (subject, question) pair must have an entry before scoring.
    missing_names = [f'{subject}-{question_i}'
                     for subject in sub_transcripts
                     for question_i in range(len(questions_list))
                     if f'{subject}-{question_i}' not in results]
    for name in missing_names:
        print(name)
    n_missing = len(missing_names)
    assert n_missing == 0, f'{n_missing} responses missing for {MODEL_TO_EVALUATE[-1]}'

    # subject -> {'question_1': answer, ...}; result keys are 0-based, column names 1-based.
    out_dict = dict()
    for subject in sub_transcripts:
        for question_i in range(len(questions_list)):
            if subject not in out_dict:
                out_dict[subject] = {}
            v = results[f'{subject}-{question_i}'][0]
            if v is None: raise Exception(f'{subject}-{question_i} is None')
            out_dict[subject][f'question_{question_i+1}'] = v
    sub_ids = list(sub_transcripts)

    # One row per subject, question columns in questionnaire order.
    out_df = pd.DataFrame(out_dict).T
    out_df = out_df[[f'question_{i+1}' for i in range(len(questions_list))]]
    out_df['subject'] = out_df.index.values
    out_df['participantID'] = out_df.index.values

    # Per-domain score = mean of the (possibly reverse-coded) item scores.
    gpt_neo_scores = pd.DataFrame()
    gpt_neo_scores['participantID'] = out_df.index.values
    for dim in ['O', 'C', 'E', 'A', 'N']:
        # Column names are derived from the text before the first ':' of each question.
        dim_questions = [x['text'].split(':')[0].lower().replace(' ', '_') for x in questions_list if x['domain']==dim]
        is_reverse_coded = [questions_list[q_i-1]['keyed']!='plus' for q_i in [int(x.split('_')[1]) for x in dim_questions]]
        v = [np.mean(map_responses_to_integers(a, is_reverse_coded)) for a in out_df[dim_questions].values]
        gpt_neo_scores[name_mapping_aapecs[dim]] = v

    model_canonical_name = MODEL_TO_EVALUATE[-1]
    gpt_neo_scores.to_csv(f'{DATA_ROOT}/aapecs_{model_canonical_name}_text_per_question_scores.csv')
    out_df.to_csv(f'{DATA_ROOT}/aapecs_{model_canonical_name}_text_per_question_responses.csv')

Total responses expected: 12960
Successful: 12960
Failed: 0
Total cost: $87.135
Avg cost per response: $0.0067
Total runtime: 00:24:27
All responses successfully completed.
