In [1]:
from IPython.display import display, HTML
# Widen the notebook container to 95% of the browser window.
display(HTML("<style>.container { width:95% !important; }</style>"))

def set_css_in_cell_output(*_event_args):
    """
    Inject CSS that colours ipywidgets text and labels light grey (#d5d5d5).

    Registered below as a `pre_run_cell` callback, so the style block is
    re-emitted before every cell execution.

    Parameters
    ----------
    *_event_args
        Whatever IPython passes to the callback. The values are ignored;
        accepting them keeps the callback usable whether or not the event
        supplies an argument.
    """
    display(HTML('''
        <style>
            .jupyter-widgets {color: #d5d5d5 !important;}
            .widget-label {color: #d5d5d5 !important;}
        </style>
    '''))

get_ipython().events.register('pre_run_cell', set_css_in_cell_output)
from IPython.core import ultratb
# Background colour used to highlight the failing line in verbose tracebacks.
# NOTE(review): `_tb_highlight` is a private attribute — confirm it exists in the installed IPython.
ultratb.VerboseTB._tb_highlight = "bg:#0D0D0D"

In [2]:
import pickle
import time
import tiktoken
import datetime
import json
import requests
import traceback
import re
import numpy as np
from tqdm.notebook import tqdm
from jsonschema import validate
from openai import OpenAI, RateLimitError, APITimeoutError, InternalServerError, Timeout
from tenacity import retry, stop_after_attempt, wait_incrementing, retry_if_exception_type, after_log, before_sleep_log
import logging
import mysql.connector
from mysql.connector import Error
from IPython.display import clear_output

In [None]:
import os

# Read the key from the environment so it is never saved inside the notebook.
# Falls back to the empty string, which is the value this cell used to assign.
OPENROUTER_API_KEY = os.environ.get('OPENROUTER_API_KEY', '')        # private openrouter api key

In [4]:
# Per-minute budgets used as defaults by the ChatGPT class.
request_limit_per_minute, token_limit_per_minute = 500, 2e6

# maximum wait time for the API to respond before triggering a request timeout
request_timeout_seconds = 120
# number of times the client automatically retries a failed request
request_max_retries = 1
# if our internal TPM estimate thinks the TPM limit is exceeded, how often to check if the limit cleared
tpm_wait_polling_seconds = 10

# global (root) logger for static classes, configured to emit INFO and above
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger()

In [6]:
class ChatGPT:
    """
    Small client for sending single-turn Likert-style prompts through the
    OpenRouter chat-completions endpoint.

    Each instance keeps its own history of sent messages and received
    responses so that running token counts and cost can be reported.
    """

    def __init__(self, model_provider_order,
                 halt_on_error=True,
                 is_verbose=True,
                 timeout=request_timeout_seconds,
                 max_retries=request_max_retries,
                 request_limit_per_minute=request_limit_per_minute,
                 token_limit_per_minute=token_limit_per_minute,
                 tpm_wait_polling_seconds=tpm_wait_polling_seconds,
                 logger=logger,
                 api_key=OPENROUTER_API_KEY,
                 # Looked up lazily so the class can be defined on a fresh kernel where
                 # LIMIT_MANAGER_DB_PASSWORD was never assigned (falls back to None).
                 limit_manager_db_password=globals().get('LIMIT_MANAGER_DB_PASSWORD')):
        """
        Parameters
        ----------
        model_provider_order : tuple
            (model name, provider order list, canonical model name).
        halt_on_error : bool
            When True, failures raise; when False they are logged (if verbose)
            and the failing call returns None.
        is_verbose : bool
            Enables informational log messages on failures and TPM waits.
        timeout, max_retries
            Forwarded to the OpenAI client.
        request_limit_per_minute : number
            Used to derive the fixed delay slept before every request.
        token_limit_per_minute : number
            Upper bound used by the token checks.
        tpm_wait_polling_seconds : number
            Sleep interval while waiting for the internal TPM window to clear.
        logger : logging.Logger
            Logger used for all messages emitted by the instance.
        api_key : str
            OpenRouter API key.
        limit_manager_db_password
            Stored on the instance; not used by any method in this class.
        """
        self.model, self.provider_order, self.model_canonical_name = model_provider_order
        self.halt_on_error = halt_on_error
        self.is_verbose = is_verbose
        self.tpm_wait_polling_seconds = tpm_wait_polling_seconds
        self.request_limit_per_minute = request_limit_per_minute
        self.request_delay_seconds = 60.0 / request_limit_per_minute
        self.token_limit_per_minute = token_limit_per_minute
        self.response_history = []
        self.message_history = {}
        # (n_tokens, timestamp) pairs consumed by check_internal_TPM_tracker
        self.token_count_history = []
        self.logger = logger
        self.limit_manager_db_password = limit_manager_db_password
        likert_options = [
            "Strongly Disagree",
            "Strongly Agree",
            "Disagree",
            "Neutral",
            "Agree",
        ]
        # Sort by length to match longer options (e.g., "Strongly Agree") before shorter ones (e.g., "Agree")
        self.likert_options = sorted(likert_options, key=len, reverse=True)
        # models whose name contains 'x-ai' are sent seed=1, all others seed=0
        self.default_seed = 1 if 'x-ai' in self.model else 0

        self.client = OpenAI(base_url="https://openrouter.ai/api/v1",
                             api_key=api_key,
                             timeout=timeout,
                             max_retries=max_retries)

    def extract_likert_response(self, content):
        """
        Find a Likert option in `content` (case-insensitive, whole words).

        Options are tried longest first; the first option that occurs anywhere
        in the text wins.

        Returns
        -------
        str
            JSON string of the form {"response": "<option>"}.

        Raises
        ------
        ValueError
            If none of the Likert options occurs in `content`.
        """
        content_lower = content.lower()
        for option in self.likert_options:
            pattern = r'\b' + re.escape(option.lower()) + r'\b'
            if re.search(pattern, content_lower):
                return json.dumps({"response": option})
        raise ValueError(f"No Likert match found in: {content!r}")

    def get_running_cost_num_prompt_completion_tokens(self):
        """
        Aggregate usage over every response stored in `response_history`.

        Returns: total_running_cost, total_num_prompt_tokens, total_num_response_tokens
        """
        n_prompt_tokens = np.sum([x.usage.prompt_tokens for x in self.response_history])
        n_completion_tokens = np.sum([x.usage.completion_tokens for x in self.response_history])
        total_cost = sum(r.usage.cost for r in self.response_history)
        return (total_cost,
                n_prompt_tokens,
                n_completion_tokens)

    def get_key_usage_credits(self):
        """
        Query OpenRouter for the account-level totals of the API key.

        Returns
        -------
        tuple
            (total_usage, total_credits) as reported by the credits endpoint.

        Raises
        ------
        requests.HTTPError
            If the endpoint answers with an error status.
        """
        resp = requests.get(
            "https://openrouter.ai/api/v1/credits",
            headers={"Authorization": f"Bearer {self.client.api_key}"},
            # without a timeout a stalled connection would block the notebook indefinitely
            timeout=request_timeout_seconds,
        )
        resp.raise_for_status()
        info = resp.json()["data"]
        return info["total_usage"], info["total_credits"]

    # retry failing requests starting with 10 second wait,
    # increasing wait time by 10 seconds each retry, up to a max window of 120s (or 5 times)
    # the goal is to try to avoid hitting backoff,
    # we treat this as a last resort because of its runtime cost
    # NOTE(review): confirm that `Timeout` imported from openai is an exception class;
    # if it is not, it never matches and can be dropped from the tuple.
    @retry(wait=wait_incrementing(start=10, increment=10, max=120),
           stop=stop_after_attempt(5),
           retry=retry_if_exception_type((RateLimitError, APITimeoutError, InternalServerError, Timeout)),
           before_sleep=before_sleep_log(logger, logging.INFO),
           after=after_log(logger, logging.INFO))
    def completion_with_backoff(self, client, **kwargs):
        """Call `client.chat.completions.create(**kwargs)` under the retry policy above."""
        return client.chat.completions.create(**kwargs)

    def check_internal_TPM_tracker(self, n_message_tokens):
        """
        Checks internal TPM count to see if a message with length = n_message_tokens
        can be sent. If not, it waits (sleeps - blocking) until the message fits
        into the TPM limit, then records the message in the rolling window.

        Raises
        ------
        ValueError
            If the message alone exceeds the per-minute limit (waiting could never succeed).
        """
        if n_message_tokens > self.token_limit_per_minute:
            raise ValueError(f'Message with {n_message_tokens} tokens can never fit into the TPM limit')

        while True:
            # keep only the entries recorded during the last 60 seconds
            one_minute_ago = datetime.datetime.now() - datetime.timedelta(seconds=60)
            self.token_count_history = [x for x in self.token_count_history if x[1] > one_minute_ago]
            n_tokens_past_minute = np.sum([x[0] for x in self.token_count_history]) + n_message_tokens
            if n_tokens_past_minute <= self.token_limit_per_minute:
                break
            # fixed delay waiting if TPM exceeded over past minute
            # this is cpu polling, so it doesnt cost money or much compute
            if self.is_verbose:
                self.logger.info(f'Internal TPM limit exceeded, waiting for {self.tpm_wait_polling_seconds} seconds...')
            time.sleep(self.tpm_wait_polling_seconds)
        self.token_count_history.append((n_message_tokens, datetime.datetime.now()))

    def send_message(self, system_role, message, validate_response=True):
        """
        This is the primary function used to send messages to the model and get responses.
        Steps are:
          - sleep a fixed delay derived from the RPM limit
          - reject messages whose token count alone exceeds the TPM limit
          - build and send the message using the chat-completions api
          - reduce the reply to one Likert option

        Parameters
        ----------
        system_role : str
            System prompt. It is omitted from the request for models whose name contains 'x-ai'.
        message : str
            User prompt.
        validate_response : bool
            Accepted for compatibility; not used by this method.

        Returns
        -------
        tuple or None
            (response, message_id) on success, where the response's message content has been
            replaced by the JSON string {"response": "<Likert option>"} and message_id is the
            key into `message_history`. None when a failure occurred and halt_on_error is False.

        Errors are propagated using raised Exceptions when halt_on_error is True.
        """
        # sleep based on RPM limit (lazy logic, avoids keeping running count of actual requests per minute)
        time.sleep(self.request_delay_seconds)

        # estimate the token count; fall back to the gpt-4 encoding when the model name is not recognised
        try:
            encoding = tiktoken.encoding_for_model(self.model)
        except Exception:
            encoding = tiktoken.encoding_for_model('gpt-4')
        n_message_tokens = len(encoding.encode(system_role)) + len(encoding.encode(message))
        self.logger.info(f'processing message with {n_message_tokens} tokens...')
        if n_message_tokens > self.token_limit_per_minute:
            return self.bad_response_output(f'Unable to send message as it exceeds TPM. Number of tokens in message = {n_message_tokens}')

        # build and send message over openai-api
        message_id = len(self.message_history)
        self.message_history[message_id] = [] if 'x-ai' in self.model else [{"role": "system", "content": system_role}]
        self.message_history[message_id].append({"role": "user", "content": message})
        try:
            response = self.completion_with_backoff(self.client,
                                                    model=self.model,
                                                    messages=self.message_history[message_id],
                                                    temperature=0,
                                                    stream=False,
                                                    extra_body={"usage": {"include": True},
                                                                "reasoning": {# One of the following (not both):
                                                                              "effort": "medium", # Can be "high", "medium", or "low" (OpenAI-style)
                                                                              # Optional: Default is false. All models support this.
                                                                              "exclude": False # Set to true to exclude reasoning tokens from response
                                                                              },
                                                                "provider": {"order": self.provider_order,
                                                                             "sort": "price",
                                                                             "data_collection": "deny",
                                                                             "allow_fallbacks": False}},
                                                    seed=self.default_seed, logprobs=False)

            self.response_history.append(response)
            reply = response.choices[0].message
            if reply.content is None:
                self.bad_response_output('None in message content')
                return None
            if reply.content == '':
                # some replies arrive with empty content; fall back to the reasoning text,
                # but only when it is a non-empty value (never copy None into content)
                reasoning = getattr(reply, 'reasoning', None)
                if reasoning:
                    reply.content = reasoning
            try:
                likert_response = self.extract_likert_response(reply.content.strip().lower())
            except Exception:
                self.bad_response_output('GPT response didnt match likert options')
                return None
            reply.content = likert_response
        except Exception as e:
            if self.halt_on_error:
                raise
            if self.is_verbose:
                str_e = str(e)
                self.logger.info(f'An exception occurred: {str_e}')
                self.logger.info(traceback.format_exc())
            return None

        return (response, message_id)

    def bad_response_output(self, error):
        """
        General function for informing the user when an error occurs.

        Raises Exception(error) when halt_on_error is True; otherwise logs the
        error (if verbose) and returns None.
        """
        if self.halt_on_error:
            raise Exception(error)
        if self.is_verbose:
            self.logger.info(f'Error - {error}')
        return None

In [7]:
# Each entry is a (model name, provider order, canonical short name) triple,
# i.e. the `model_provider_order` argument that ChatGPT.__init__ unpacks.
llama_4_maverick = (
    'meta-llama/llama-4-maverick',
    ['Fireworks', 'Together', 'Kluster'],
    'llama_maverick',
)
gemini_flash = (
    'google/gemini-2.5-flash-preview-05-20:thinking',
    ['Google', 'Google AI Studio'],
    'gemini_flash',
)
qwen_235B = (
    'qwen/qwen3-235b-a22b',
    ['DeepInfra', 'Kluster', 'Parasail', 'Together', 'Nebius'],
    'qwen3_235B',
)
gpt_41 = (
    'openai/gpt-4.1',
    ['OpenAI'],
    'gpt_41',
)
gpt_41_mini = (
    'openai/gpt-4.1-mini',
    ['OpenAI'],
    'gpt_41_mini',
)
claude_sonnet = (
    'anthropic/claude-3.7-sonnet',
    ['Anthropic', 'Amazon Bedrock', 'Google', 'Google AI Studio'],
    'claude_sonnet',
)
grok_3 = (
    'x-ai/grok-3-beta',
    ['xAI'],
    'grok3_beta',
)

# Example: We expect a simple string response from GPT

In [None]:
# Model used by the single-request demo cells; one of the
# (model name, provider order, canonical name) tuples.
MODEL_TO_EVALUATE = gpt_41_mini

In [4]:
def example_messaging_wrapper(chat, system_role, message):
    """
    Send one message through `chat` and print the exchange together with the running cost.

    With halt_on_error set to True in the ChatGPT class, errors and edge-cases
    arrive here as exceptions; they are logged, the traceback is printed and
    the response text falls back to the empty string.

    Returns
    -------
    tuple
        (response, message_history_id); (None, -1) when sending failed.
    """
    response = None
    message_history_id = -1
    try:
        response, message_history_id = chat.send_message(system_role=system_role,
                                                         message=message,
                                                         validate_response=True)
        assert response is not None
        response_str = json.loads(response.choices[0].message.content)["response"]
    except Exception as e:
        response_str = ''
        chat.logger.info(f'Messaging wrapper failure - {str(e)}')
        print(traceback.format_exc())

    cost, _, _ = chat.get_running_cost_num_prompt_completion_tokens()

    # show what was actually sent, or nothing when no message was recorded under this id
    if message_history_id in chat.message_history:
        last_messages_sent_to_gpt = chat.message_history[message_history_id]
    else:
        last_messages_sent_to_gpt = ''

    print(f'Messages to GPT:\n{last_messages_sent_to_gpt}')
    print(f'Response from GPT:\n{response_str}')
    print(f'Cost: ${cost:.5f}')

    return response, message_history_id

In [9]:
# Specify the system role and the user message for a one-off demo request.
system_role = 'you are a helpful assistant.'
# Likert labels shown to the model (the ChatGPT class keeps its own copy for matching).
likert_options = [
    "Strongly Disagree",
    "Strongly Agree",
    "Disagree",
    "Neutral",
    "Agree",
]
likert_options_str = '\n'.join(likert_options)

# This is a plain string, not an f-string: the {question} and {thoughts}
# placeholders are sent to the model literally (see the first line of the prompt).
message = """
Simulate your response to the prompt below without being provided the question and thoughts. This is for testing purposes.

---

Your task is to respond to the following NEO-FFI question based on the participant's spontaneous stream of thoughts, provided below. Respond as though you are the individual who generated these thoughts, reflecting their personality traits.
Base your answer on inferred personality traits. Think carefully about what the thoughts imply about tendencies and behaviors.

For each question, select the most appropriate option:
- Strongly Disagree: The statement is definitely false or the participant would strongly disagree with it.
- Disagree: The statement is mostly false or the participant would generally disagree with it.
- Neutral: The participant would be neutral on the statement, cannot decide, or find the statement equally true and false.
- Agree: The statement is mostly true or the participant would generally agree with it.
- Strongly Agree: The statement is definitely true or the participant would strongly agree with it.

NEO-FFI question to answer:
{question}

Participant's spontaneous stream of thoughts:
{thoughts}

Your response must be exactly one of:
Strongly Disagree  
Disagree  
Neutral  
Agree  
Strongly Agree

Do not include any explanation, punctuation, or additional text. Return only the exact phrase from the list above.
"""

# Create a single instance of ChatGPT
# so that we can keep track of running costs.
chat = ChatGPT(model_provider_order=MODEL_TO_EVALUATE)

# Sends a live request; prints the messages, the extracted response and the cost.
response, message_history_id = example_messaging_wrapper(chat, system_role, message)

INFO:root:processing message with 271 tokens...
INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"


Messages to GPT:
[{'role': 'system', 'content': 'you are a helpful assistant.'}, {'role': 'user', 'content': "\nSimulate your response to the prompt below without being provided the question and thoughts. This is for testing purposes.\n\n---\n\nYour task is to respond to the following NEO-FFI question based on the participant's spontaneous stream of thoughts, provided below. Respond as though you are the individual who generated these thoughts, reflecting their personality traits.\nBase your answer on inferred personality traits. Think carefully about what the thoughts imply about tendencies and behaviors.\n\nFor each question, select the most appropriate option:\n- Strongly Disagree: The statement is definitely false or the participant would strongly disagree with it.\n- Disagree: The statement is mostly false or the participant would generally disagree with it.\n- Neutral: The participant would be neutral on the statement, cannot decide, or find the statement equally true and false.\

# SST

In [5]:
import pandas as pd
from scipy import stats

In [6]:
DATA_ROOT = './data/SST/'  # please contact the authors for access to the data

# NOTE: unpickling can execute arbitrary code — only load this file from a trusted source.
# The context manager closes the file handle as soon as the data is read.
with open(f'{DATA_ROOT}/SST_data.pickle', 'rb') as sst_file:
    good_th_dict = pickle.load(sst_file)
# number of top-level entries in the loaded data
print(len(good_th_dict))

60


In [14]:
# The 60 questionnaire statements, in questionnaire order (item k is at index k-1).
questions = [
    "I am not a worrier.",
    "I like to have a lot of people around me.",
    "I don’t like to waste my time daydreaming.",
    "I try to be courteous to everyone I meet.",
    "I keep my belongings neat and clean.",
    "I often feel inferior to others.",
    "I laugh easily.",
    "Once I find the right way to do something I stick to it.",
    "I often get into arguments with my family and co-workers.",
    "I’m pretty good about pacing myself so as to get things done on time.",
    "When I’m under a great deal of stress sometimes I feel like I’m going to pieces.",
    "I don’t consider myself especially 'light-hearted.'",
    "I am intrigued by the patterns I find in art and nature.",
    "Some people think I’m selfish and egotistical.",
    "I am not a very methodical person.",
    "I rarely feel lonely or blue.",
    "I really enjoy talking to people.",
    "I believe letting students hear controversial speakers can only confuse and mislead them.",
    "I would rather cooperate with others than compete with them.",
    "I try to perform all the tasks assigned to me conscientiously.",
    "I often feel tense and jittery.",
    "I like to be where the action is.",
    "Poetry has little or no effect on me.",
    "I tend to be cynical and skeptical of others’ intentions.",
    "I have a clear set of goals and work toward them in an orderly fashion.",
    "Sometimes I feel completely worthless.",
    "I usually prefer to do things alone.",
    "I often try new and foreign foods.",
    "I believe that most people will take advantage of you if you let them.",
    "I waste a lot of time before settling down to work.",
    "I rarely feel fearful or anxious.",
    "I often feel as if I’m bursting with energy.",
    "I seldom notice the moods or feelings that different environments produce.",
    "Most people I know like me.",
    "I work hard to accomplish my goals.",
    "I often get angry at the way people treat me.",
    "I am a cheerful high-spirited person.",
    "I believe we should look to our religious authorities for decisions on moral issues.",
    "Some people think of me as cold and calculating.",
    "When I make a commitment I can always be counted on to follow through.",
    "Too often when things go wrong I get discouraged and feel like giving up.",
    "I am not a cheerful optimist.",
    "Sometimes when I am reading poetry or looking at a work of art I feel a chill or wave of excitement.",
    "I’m hard-headed and tough-minded in my attitudes.",
    "Sometimes I’m not as dependable or reliable as I should be.",
    "I am seldom sad or depressed.",
    "My life is fast-paced.",
    "I have little interest in speculating on the nature of the universe or the human condition.",
    "I generally try to be thoughtful and considerate.",
    "I am a productive person who always gets the job done.",
    "I often feel helpless and want someone else to solve my problems.",
    "I am a very active person.",
    "I have a lot of intellectual curiosity.",
    "If I don’t like people I let them know it.",
    "I never seem to be able to get organized.",
    "At times I have been so ashamed I just wanted to hide.",
    "I would rather go my own way than be a leader of others.",
    "I often enjoy playing with theories or abstract ideas.",
    "If necessary I am willing to manipulate people to get what I want.",
    "I strive for excellence in everything I do."
]
# Prefix every statement with its 1-based number, updating the list in place.
for idx, statement in enumerate(questions):
    questions[idx] = f'Question {idx + 1}: {statement}'

In [17]:
def get_NEO_FFI_prompt(thoughts, question):
    """
    Build the user prompt asking the model to answer one NEO-FFI question
    as the participant who produced `thoughts`.

    Parameters
    ----------
    thoughts : str
        The participant's stream of thoughts, inserted verbatim.
    question : str
        The question text, inserted verbatim.

    Returns
    -------
    str
        The filled-in prompt (it begins and ends with a newline).
    """
    return f"""
Your task is to respond to the following NEO-FFI question based on the participant's spontaneous stream of thoughts, provided below. Respond as though you are the individual who generated these thoughts, reflecting their personality traits.
Base your answer on inferred personality traits. Think carefully about what the thoughts imply about tendencies and behaviors.

For each question, select the most appropriate option:
- Strongly Disagree: The statement is definitely false or the participant would strongly disagree with it.
- Disagree: The statement is mostly false or the participant would generally disagree with it.
- Neutral: The participant would be neutral on the statement, cannot decide, or find the statement equally true and false.
- Agree: The statement is mostly true or the participant would generally agree with it.
- Strongly Agree: The statement is definitely true or the participant would strongly agree with it.

NEO-FFI question to answer:
{question}

Participant's spontaneous stream of thoughts:
{thoughts}

Your response must be exactly one of:
Strongly Disagree  
Disagree  
Neutral  
Agree  
Strongly Agree

Do not include any explanation, punctuation, or additional text. Return only the exact phrase from the list above.
"""

In [19]:
def generic_messaging_wrapper(chat, system_role, message):
    """
    Send one message through `chat` and return the extracted answer with the running cost.

    With halt_on_error set to True in the ChatGPT class, errors and edge-cases
    arrive here as exceptions; they are logged and the traceback is printed.

    Returns
    -------
    tuple
        (required_values, cost) where required_values is the 'response' field of the
        JSON reply, or None when anything went wrong, and cost is the running cost
        reported by `chat`.
    """
    required_values = None
    try:
        response, _message_history_id = chat.send_message(system_role=system_role,
                                                          message=message)
        assert response is not None
        required_values = json.loads(response.choices[0].message.content)['response']
    except Exception as e:
        chat.logger.info(f'Messaging wrapper failure - {str(e)}')
        print(traceback.format_exc())

    cost, _, _ = chat.get_running_cost_num_prompt_completion_tokens()
    return required_values, cost

In [20]:
# Use an empty system prompt from here on; multiproc_neo_wrapper reads this
# module-level name when it sends each request.
system_role = ''
print(system_role)




In [21]:
# Smoke test: answer the first question for one subject with a live request.
chat = ChatGPT(model_provider_order=MODEL_TO_EVALUATE)
# Flatten the nested entries stored under key '5' into one newline-separated string.
# NOTE(review): presumably a list of lists of thought strings — confirm against the data file.
thoughts = '\n'.join([a for b in good_th_dict['5'] for a in b])

question_i = 0
question = questions[question_i]
message = get_NEO_FFI_prompt(thoughts, question)
# required_values is the extracted Likert label (None on failure); cost is the running cost of `chat`
required_values, cost = generic_messaging_wrapper(chat, system_role, message)

print(required_values)

INFO:root:processing message with 4456 tokens...
INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"


Disagree


In [22]:
from multiprocessing import Process, Manager

def multiproc_neo_wrapper(thoughts, question_i, name, return_dict, model_to_evaluate):
    """
    Worker entry point: answer one question for one transcript and store the result.

    Parameters
    ----------
    thoughts : str
        Transcript text inserted into the prompt.
    question_i : int
        Index into the module-level `questions` list.
    name : str
        Key under which the result is stored in `return_dict`.
    return_dict : mapping
        Shared result container; receives `(required_values, cost)` under `name`.
    model_to_evaluate : tuple
        Passed to ChatGPT as `model_provider_order`.

    Any exception is logged and swallowed: the key is then simply left unset,
    which lets the caller detect the request as missing and retry it.
    """
    try:
        chat = ChatGPT(model_provider_order=model_to_evaluate)
        question = questions[question_i]
        message = get_NEO_FFI_prompt(thoughts, question)
        required_values, cost = generic_messaging_wrapper(chat, system_role, message)
        return_dict[f'{name}'] = (required_values, cost)
    except Exception:
        # stay best-effort, but leave a trace so repeated failures can be diagnosed
        logging.getLogger().exception(f'multiproc_neo_wrapper failed for request {name}')

In [23]:
import resource

# Minimum number of open file descriptors wanted for this process.
# NOTE(review): presumably needed for the many concurrent worker processes started later — confirm.
NOFILE_SOFT_TARGET = 30000

soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
print("Soft limit:", soft, "Hard limit:", hard)
# Raise soft limit (if you have permission). Never lower an already higher limit,
# and never ask for more than the hard limit allows.
if soft != resource.RLIM_INFINITY and soft < NOFILE_SOFT_TARGET:
    if hard == resource.RLIM_INFINITY:
        new_soft = NOFILE_SOFT_TARGET
    else:
        new_soft = min(NOFILE_SOFT_TARGET, hard)
    try:
        resource.setrlimit(resource.RLIMIT_NOFILE, (new_soft, hard))
    except (ValueError, OSError) as exc:
        print("Could not raise soft limit:", exc)
soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
print("New Soft limit:", soft, "New Hard limit:", hard)

Soft limit: 4096 Hard limit: 262144
New Soft limit: 30000 New Hard limit: 262144


In [24]:
def _unresolved_requests(good_th_dict, questions, results):
    """Return (subject, question_i, name) for every request that has no usable answer yet."""
    snapshot = dict(results)  # one copy instead of one lookup per key on the shared dict
    pending = []
    for subject in good_th_dict:
        for question_i in range(len(questions)):
            name = f'{subject}-{question_i}'
            entry = snapshot.get(name)
            # missing key = worker crashed; None answer = request failed or reply not usable
            if entry is None or entry[0] is None:
                pending.append((subject, question_i, name))
    return pending


def run_until_resolved(good_th_dict, questions, MODEL_TO_EVALUATE,
                       max_attempts=5, sleep_between=0.1, max_concurrent_calls=1000):
    """
    Ask every question for every subject, one worker process per request,
    retrying unresolved requests for up to `max_attempts` rounds.

    Parameters
    ----------
    good_th_dict : mapping
        subject -> nested iterable of text pieces; the pieces are flattened and
        joined with newlines to form the transcript of that subject.
    questions : sequence
        Questions; only the indices are passed to the workers.
    MODEL_TO_EVALUATE : tuple
        Model description forwarded to each worker.
    max_attempts : int
        Maximum number of dispatch rounds.
    sleep_between : float
        Seconds slept after starting each worker process.
    max_concurrent_calls : int
        Number of started processes after which all of them are joined
        before more are started.

    Returns
    -------
    The shared dict mapping '<subject>-<question index>' to (answer, cost).
    A request counts as resolved only when its answer is not None.

    Notes
    -----
    A retried request overwrites its previous entry, so reported costs cover
    only the latest attempt of each request.
    """
    return_dict = Manager().dict()
    total_requests = len(good_th_dict) * len(questions)
    start_time = time.time()

    def format_time(seconds):
        mins, secs = divmod(int(seconds), 60)
        hrs, mins = divmod(mins, 60)
        return f"{hrs:02d}:{mins:02d}:{secs:02d}"

    def progress_snapshot():
        """Return (summed cost, number of recorded entries, number of usable answers)."""
        recorded = list(dict(return_dict).values())
        cost = np.sum([entry[-1] for entry in recorded])
        answered = sum(1 for entry in recorded if entry[0] is not None)
        return cost, len(recorded), answered

    for attempt in range(1, max_attempts + 1):
        # Check for missing entries
        missing_items = _unresolved_requests(good_th_dict, questions, return_dict)
        if not missing_items:
            print("All responses successfully completed.")
            break
        print(f"Attempt {attempt}: {len(missing_items)} / {total_requests} requests missing.")

        # Dispatch missing jobs
        procs = []
        for subject, question_i, name in missing_items:
            transcript = '\n'.join([a for b in good_th_dict[subject] for a in b])
            proc = Process(target=multiproc_neo_wrapper,
                           args=(transcript, question_i, name, return_dict, MODEL_TO_EVALUATE))
            proc.start()
            procs.append(proc)
            time.sleep(sleep_between)

            if len(procs) >= max_concurrent_calls:
                for running in procs:
                    running.join()
                procs = []

                # Intermediate update
                running_cost, n_recorded, n_completed = progress_snapshot()
                avg_cost = running_cost / n_recorded if n_recorded else 0
                estimated_total_cost = avg_cost * total_requests

                elapsed_time = time.time() - start_time
                avg_time_per_call = elapsed_time / n_completed if n_completed else 0
                remaining_calls = total_requests - n_completed
                estimated_time_remaining = avg_time_per_call * remaining_calls

                clear_output(wait=True)
                print(f"Attempt {attempt}")
                print(f"Completed: {n_completed}/{total_requests}")
                print(f"Running cost: ${running_cost:.3f}")
                print(f"Estimated total cost: ${estimated_total_cost:.3f}")
                print(f"Elapsed time: {format_time(elapsed_time)}")
                print(f"Estimated time remaining: {format_time(estimated_time_remaining)}")

        # Final join
        for running in procs:
            running.join()

        # Summary of this attempt; entries with a None answer count as failed
        total_cost, n_recorded, n_completed = progress_snapshot()
        avg_cost = total_cost / n_recorded if n_recorded else 0
        total_elapsed_time = time.time() - start_time

        clear_output(wait=True)
        print(f"Total responses expected: {total_requests}")
        print(f"Successful: {n_completed}")
        print(f"Failed: {total_requests - n_completed}")
        print(f"Total cost: ${total_cost:.3f}")
        print(f"Avg cost per response: ${avg_cost:.4f}")
        print(f"Total runtime: {format_time(total_elapsed_time)}")

    else:
        # All attempts were used: the last one may still have resolved everything
        n_missing = len(_unresolved_requests(good_th_dict, questions, return_dict))
        if n_missing:
            print(f'!!! Max attempts reached. {n_missing} requests are still unresolved. !!!')
        else:
            print("All responses successfully completed.")
    return return_dict

In [25]:
def score_neo_ffi(df):
    """
    Convert Likert labels to item scores and sum them into the five dimension scores.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per subject with a 'subject' column and the columns
        'question_1' ... 'question_60' holding Likert labels.
        The frame is not modified.

    Returns
    -------
    scores : pandas.DataFrame
        'subject' plus one summed score column per dimension.
    mapped : pandas.DataFrame
        Copy of `df` whose question columns hold the numeric item scores (0-4).

    Raises
    ------
    ValueError
        If a question column contains a value that is not one of the five Likert labels.
    """
    # Scoring mapping
    scoring_map = {'Strongly Disagree': 0, 'Disagree': 1, 'Neutral': 2, 'Agree': 3, 'Strongly Agree': 4}

    # Negative scoring mapping (reverse scoring)
    reverse_scoring_map = {'Strongly Disagree': 4, 'Disagree': 3, 'Neutral': 2, 'Agree': 1, 'Strongly Agree': 0}

    # Define the items for each dimension based on the provided scoring guide
    dimensions = {
        'Neuroticism': [1, 6, 11, 16, 21, 26, 31, 36, 41, 46, 51, 56],
        'Extraversion': [2, 7, 12, 17, 22, 27, 32, 37, 42, 47, 52, 57],
        'Openness': [3, 8, 13, 18, 23, 28, 33, 38, 43, 48, 53, 58],
        'Agreeableness': [4, 9, 14, 19, 24, 29, 34, 39, 44, 49, 54, 59],
        'Conscientiousness': [5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60]
    }

    # Reverse scoring items based on the provided scoring guide
    # NOTE(review): item 33 is negatively worded but is not reverse-keyed here —
    # confirm against the scoring guide.
    reverse_items = {1, 16, 31, 46, 12, 27, 42, 57, 3, 8, 18, 23, 38, 48,
                     9, 14, 24, 29, 39, 44, 54, 59, 15, 30, 45, 55}

    # Work on a copy so the caller's frame keeps its labels and the function
    # can be called again on the same input.
    df = df.copy()

    # Calculate scores for each dimension
    scores = df[['subject']].copy()

    for dimension, items in dimensions.items():
        columns = [f'question_{item}' for item in items]
        for item, column_name in zip(items, columns):
            item_map = reverse_scoring_map if item in reverse_items else scoring_map
            # refuse to score anything that is not a known label (would silently become NaN)
            unmapped = set(df[column_name]) - set(item_map)
            if unmapped:
                raise ValueError(f"Unmapped values found in {column_name}: {unmapped}")
            df[column_name] = df[column_name].map(item_map)
        scores[dimension] = df[columns].sum(axis=1)

    return scores, df

In [27]:
# NOTE(review): these tuples repeat the model definitions from an earlier cell with identical values.
# Each one is (model name, provider order, canonical short name).
llama_4_maverick = ('meta-llama/llama-4-maverick', ['Fireworks', 'Together', 'Kluster'], 'llama_maverick')
gemini_flash = ('google/gemini-2.5-flash-preview-05-20:thinking', ['Google', 'Google AI Studio'], 'gemini_flash')
qwen_235B = ('qwen/qwen3-235b-a22b', ['DeepInfra', 'Kluster', 'Parasail', 'Together', 'Nebius'], 'qwen3_235B')  
gpt_41 = ('openai/gpt-4.1', ['OpenAI'], 'gpt_41')
gpt_41_mini = ('openai/gpt-4.1-mini', ['OpenAI'], 'gpt_41_mini')
claude_sonnet = ('anthropic/claude-3.7-sonnet', ['Anthropic', 'Amazon Bedrock', 'Google', 'Google AI Studio'], 'claude_sonnet')
grok_3 = ('x-ai/grok-3-beta', ['xAI'], 'grok3_beta')


# Run the full questionnaire for every model and write two CSV files per model.
# Note: the loop rebinds the module-level MODEL_TO_EVALUATE on every iteration.
for MODEL_TO_EVALUATE in [llama_4_maverick,
                          gemini_flash,
                          qwen_235B,
                          gpt_41,
                          gpt_41_mini,
                          claude_sonnet,
                          grok_3
                          ]:

    
    # one request per (subject, question); retried until resolved or attempts run out
    return_dict = run_until_resolved(good_th_dict, questions, MODEL_TO_EVALUATE)

    # NOTE(review): procs, n_subs and transcript are assigned but not read in this cell.
    procs = []
    n_subs = len(good_th_dict)
    n_missing = 0

    # every request must have an entry before scoring
    for (subject_i, subject) in enumerate(good_th_dict):
        transcript = '\n'.join([a for b in good_th_dict[subject] for a in b])
        for question_i in range(len(questions)):
            name = f'{subject}-{question_i}'
            if name not in return_dict:
                n_missing += 1

    assert n_missing == 0

    # reshape to {subject: {'question_k': label}}; result keys use 0-based question indices
    # NOTE(review): range(1, 61) hard-codes 60 questions instead of using len(questions).
    out_dict = dict()
    for subject in good_th_dict:
        for question_i in range(1, 61):
            if subject not in out_dict:
                out_dict[subject] = {}
            out_dict[subject][f'question_{question_i}'] = return_dict[f'{subject}-{question_i-1}'][0]
            if out_dict[subject][f'question_{question_i}'] is None: raise Exception(f'{subject}-{question_i} is None')


    # one row per subject, question columns in questionnaire order
    out_df = pd.DataFrame(out_dict).T
    out_df = out_df[[f'question_{i+1}' for i in range(len(questions))]]
    out_df['subject'] = pd.DataFrame(out_dict).T.index

    gpt_neo_scores, rating_df = score_neo_ffi(out_df)
    gpt_neo_scores.columns = [x.lower() for x in gpt_neo_scores.columns]

    # merge with the scales file; only 'subject' and the suffixed (_x / _y) columns are kept
    # NOTE(review): scales_df is not used after this point in this cell.
    scales_df = pd.read_csv(f'{DATA_ROOT}/SST_scales.csv')
    scales_df = scales_df.merge(gpt_neo_scores, on='subject', how='outer')
    cols_to_keep = ['subject'] + [x for x in scales_df.columns if '_x' in x] + [x for x in scales_df.columns if '_y' in x]
    scales_df = scales_df[cols_to_keep].dropna()

    # the canonical short name (last tuple element) goes into the output file names
    model_canonical_name = MODEL_TO_EVALUATE[-1]
    gpt_neo_scores.to_csv(f'{DATA_ROOT}/sst_{model_canonical_name}_text_per_question_scores.csv')

    # rebuild the label table from out_dict before saving the raw responses
    out_df = pd.DataFrame(out_dict).T
    out_df = out_df[[f'question_{i+1}' for i in range(len(questions))]]
    out_df['subject'] = pd.DataFrame(out_dict).T.index
    out_df.to_csv(f'{DATA_ROOT}/sst_{model_canonical_name}_text_per_question_responses.csv')

Total responses expected: 3600
Successful: 3600
Failed: 0
Total cost: $5.321
Avg cost per response: $0.0015
Total runtime: 00:08:26
All responses successfully completed.
