In [1]:
from IPython.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

def set_css_in_cell_output():
    display(HTML('''
        <style>
            .jupyter-widgets {color: #d5d5d5 !important;}
            .widget-label {color: #d5d5d5 !important;}
        </style>
    '''))

get_ipython().events.register('pre_run_cell', set_css_in_cell_output)
from IPython.core import ultratb
ultratb.VerboseTB._tb_highlight = "bg:#0D0D0D"

In [2]:
import pickle
import time
import tiktoken
import datetime
import json
import requests
import traceback
import re
import numpy as np
from tqdm.notebook import tqdm
from jsonschema import validate
from openai import OpenAI, RateLimitError, APITimeoutError, InternalServerError, Timeout
from tenacity import retry, stop_after_attempt, wait_incrementing, retry_if_exception_type, after_log, before_sleep_log
import logging
import mysql.connector
from mysql.connector import Error
from IPython.display import clear_output

In [None]:
OPENROUTER_API_KEY = ''        # private openrouter api key

In [4]:
request_limit_per_minute = 500
token_limit_per_minute = 2e6

request_timeout_seconds = 120   # maximum wait time for openAI to respond before triggering request timeout 
request_max_retries = 1         # number to times to automatically retry failed requests
tpm_wait_polling_seconds = 10    # if our internal TPM estimate thinks TPM limit is exceeded, how often to check if limit cleared

# global logger for static classes
logger = logging.getLogger()
logging.basicConfig(level=logging.INFO)

In [6]:
class ChatGPT:
    def __init__(self, model_provider_order,
                 halt_on_error=True,
                 is_verbose=True,
                 timeout=request_timeout_seconds,
                 max_retries=request_max_retries,
                 request_limit_per_minute=request_limit_per_minute,
                 token_limit_per_minute=token_limit_per_minute,
                 tpm_wait_polling_seconds=tpm_wait_polling_seconds,
                 logger=logger,
                 api_key=OPENROUTER_API_KEY,
                 limit_manager_db_password=LIMIT_MANAGER_DB_PASSWORD):
        self.model, self.provider_order, self.model_canonical_name = model_provider_order
        self.halt_on_error = halt_on_error
        self.is_verbose = is_verbose
        self.tpm_wait_polling_seconds = tpm_wait_polling_seconds
        self.request_limit_per_minute = request_limit_per_minute
        self.request_delay_seconds = 60.0 / request_limit_per_minute
        self.token_limit_per_minute = token_limit_per_minute
        self.response_history = []
        self.message_history = {}
        self.logger = logger
        self.limit_manager_db_password = limit_manager_db_password
        likert_options = [
            "Strongly Disagree",
            "Strongly Agree",
            "Disagree",
            "Neutral",
            "Agree",
        ]
        # Sort by length to match longer options (e.g., "Strongly Agree") before shorter ones (e.g., "Agree")
        self.likert_options = sorted(likert_options, key=len, reverse=True)
        self.default_seed = 1 if 'x-ai' in self.model else 0
        
        self.client = OpenAI(base_url="https://openrouter.ai/api/v1",
                             api_key = api_key,
                             timeout=timeout,
                             max_retries=max_retries)

        
    def extract_likert_response(self, content):
        content_lower = content.lower()
        for option in self.likert_options:
            pattern = r'\b' + re.escape(option.lower()) + r'\b'
            match = re.search(pattern, content_lower)
            if match:
                return json.dumps({"response": option})
        raise Exception("No Likert match found in: ", content)    
    
    
    def get_running_cost_num_prompt_completion_tokens(self):
        """
        This function computes the total cost (estimated) of all
        messages sent by the instance of ChatGPT called from
        Returns: total_running_cost, total_num_prompt_tokens, total_num_response_tokens
        """
        n_prompt_tokens = np.sum([x.usage.prompt_tokens for x in self.response_history])
        n_completion_tokens = np.sum([x.usage.completion_tokens for x in self.response_history])
        total_cost = sum(r.usage.cost for r in self.response_history)
        return (total_cost,
                n_prompt_tokens,
                n_completion_tokens)

    def get_key_usage_credits(self):
        # get what OpenRouter says the api key has used in total
        # returns usage, total credits available
        resp = requests.get(
            "https://openrouter.ai/api/v1/credits",
            headers={"Authorization": f"Bearer {self.client.api_key}"}
        )
        resp.raise_for_status()
        info = resp.json()["data"]
        return info["total_usage"], info["total_credits"]

    # retry failing requests starting with 10 second wait,
    # increasing wait time by 10 seconds each retry, up to a max window of 120s (or 5 times)
    # the goal is to try to avoid hitting backoff,
    # we treat this as a last resort because of its runtime cost
    @retry(wait=wait_incrementing(start=10, increment=10, max=120),
           stop=stop_after_attempt(5),
           retry=retry_if_exception_type((RateLimitError, APITimeoutError, InternalServerError, Timeout)),
           before_sleep=before_sleep_log(logger, logging.INFO),
           after=after_log(logger, logging.INFO))
    def completion_with_backoff(self, client, **kwargs):
        return client.chat.completions.create(**kwargs)


    def check_internal_TPM_tracker(self, n_message_tokens):
        """
        Checks internal TPM count to see if a message with length = n_message_tokens
        can be sent. If not, it waits (sleeps - blocking) until the message delivery
        meets into TPM limit
        """
        now = datetime.datetime.now()
        one_minute_ago = now + datetime.timedelta(seconds=-60)
        self.token_count_history = [x for x in self.token_count_history if x[1] > one_minute_ago]
        n_tokens_past_minute = np.sum([x[0] for x in self.token_count_history]) + n_message_tokens
        # fixed delay waiting if TPM exceeded over past minute
        # this is cpu polling, so it doesnt cost money or much compute
        while n_tokens_past_minute > self.token_limit_per_minute:
            if self.is_verbose: self.logger.info(f'Internal TPM limit exceeded, waiting for {self.tpm_wait_polling_seconds} seconds...')
            time.sleep(self.tpm_wait_polling_seconds)
            now = datetime.datetime.now()
            one_minute_ago = now + datetime.timedelta(seconds=-60)
            self.token_count_history = [x for x in self.token_count_history if x[1] > one_minute_ago]
            n_tokens_past_minute = np.sum([x[0] for x in self.token_count_history]) + n_message_tokens
        now = datetime.datetime.now()
        self.token_count_history.append((n_message_tokens, now))


    def send_message(self, system_role, message, json_schema, validate_response=True):
        """
        This is the primary function used to send messages to GPT and get responses
        Steps are:
          - check that json schema meets our basic requirements
          - handle RPM and TPM limits as best as we can
            (when openai rejects requests for exceeding limits its much slower)
          - build and send the message using openai ChatCompletion api
          - perform basic validation on GPT's response
        This function either returns a ChatCompletion response object or None (if failure occurred)
        Errors are propogated using raised Exceptions
        """
        # sleep based on RPM limit (lazy logic, avoids keeping running count of actual requests per minute)
        time.sleep(self.request_delay_seconds)

        # check TPM limit (not lazy, uses running count of tokens per minute)
        try:
            encoding = tiktoken.encoding_for_model(self.model)
        except:
            encoding = tiktoken.encoding_for_model('gpt-4')
        n_message_tokens = len(encoding.encode(system_role)) + len(encoding.encode(message))
        self.logger.info(f'processing message with {n_message_tokens} tokens...')
        if n_message_tokens > self.token_limit_per_minute:
            return self.bad_response_output(f'Unable to send message as it exceeds TPM. Number of tokens in message = {n_message_tokens}')
                
        # build and send message over openai-api
        message_id = len(self.message_history.keys())
        self.message_history[message_id] = [] if 'x-ai' in self.model else [{"role": "system", "content": system_role}]
        self.message_history[message_id].append({"role": "user", "content": message})
        try:
            response = self.completion_with_backoff(self.client,
                                                    model=self.model,
                                                    messages=self.message_history[message_id],
                                                    temperature=0,
                                                    stream=False,
                                                    extra_body={"usage": {"include": True},
                                                                "reasoning": {# One of the following (not both):
                                                                              "effort": "medium", # Can be "high", "medium", or "low" (OpenAI-style)
                                                                              # Optional: Default is false. All models support this.
                                                                              "exclude": False # Set to true to exclude reasoning tokens from response
                                                                              },
                                                                "provider": {"order": self.provider_order, 
                                                                             "sort": "price",
                                                                             "data_collection": "deny",
                                                                             "allow_fallbacks": False}},
                                                    response_format={"type": "json_schema",
                                                                     "json_schema": json_schema},

                                                    seed=self.default_seed, logprobs=False)
            
            self.response_history.append(response)
            # reasoning models dont return a content field
            if response.choices[0].message.content is None:
                self.bad_response_output(f'None in message content')
                return None
            elif response.choices[0].message.content == '':
                if hasattr(response.choices[0].message, 'reasoning'):
                    if response.choices[0].message.reasoning != '':
                        response.choices[0].message.content = response.choices[0].message.reasoning
            else:
                pass
        except Exception as e:
            if self.halt_on_error:
                raise
            else:
                if self.is_verbose:
                    str_e = str(e)
                    self.logger.info(f'An exception occurred: {str_e}')
                    self.logger.info(traceback.format_exc())
                return None

        return (response, message_id)
        

    def bad_response_output(self, error):
        # general function for informing the user when an error occurs
        if self.halt_on_error:
            raise Exception(error)
        else:
            if self.is_verbose:
                self.logger.info(f'Error - {error}')
        return None

In [7]:
llama_4_maverick = ('meta-llama/llama-4-maverick', ['Fireworks', 'Together', 'Kluster'], 'llama_maverick')
gemini_flash = ('google/gemini-2.5-flash-preview-05-20:thinking', ['Google', 'Google AI Studio'], 'gemini_flash')
qwen_235B = ('qwen/qwen3-235b-a22b', ['DeepInfra', 'Kluster', 'Parasail', 'Together', 'Nebius'], 'qwen3_235B')  
gpt_41 = ('openai/gpt-4.1', ['OpenAI'], 'gpt_41')
gpt_41_mini = ('openai/gpt-4.1-mini', ['OpenAI'], 'gpt_41_mini')
claude_sonnet = ('anthropic/claude-3.7-sonnet', ['Anthropic', 'Amazon Bedrock', 'Google', 'Google AI Studio'], 'claude_sonnet')
grok_3 = ('x-ai/grok-3-beta', ['xAI'], 'grok3_beta')

# Example: We expect a simple string response from GPT

In [None]:
MODEL_TO_EVALUATE = gpt_41

In [8]:
# specify GPT output json schema for a simple string response
# all response schemas must contain "refusal" and "reason_for_refusal" fields
simple_string_response_json = {
    "name": "simple_string_response_json",
    "description": "Schema for a simple string response with refusal tracking",
    "schema": {
        "type": "object",
        "description": "JSON schema for a simple string response",
        "properties": {
            "response": {
                "type": "string",
                "description": "The generated output by GPT, formatted as a plain string"
            }
        },
        "additionalProperties": False,
        "required": ["response"]
    },
    "strict": True
}

In [9]:
def example_messaging_wrapper(chat, system_role, message, json_schema):
    # with halt_on_error set to True in ChatGPT class, 
    # we use exception propogation to handle errors and edge-cases
    response, message_history_id = None, -1
    try:
        response, message_history_id = chat.send_message(system_role=system_role,
                                                         message=message, json_schema=json_schema,
                                                         validate_response=True)
        assert response is not None
        response_json = json.loads(response.choices[0].message.content)
        response_str = response_json["response"]
    except Exception as e:
        response_json = {}
        response_str = ''
        chat.logger.info(f'Messaging wrapper failure - {str(e)}')
        print(traceback.format_exc())

    cost, n_prompt_tokens, n_completion_tokens = chat.get_running_cost_num_prompt_completion_tokens()
    last_messages_sent_to_gpt = '' if (message_history_id not in chat.message_history) else chat.message_history[message_history_id]
    print(f'Messages to GPT:\n{last_messages_sent_to_gpt}')
    print(f'Response from GPT:\n{response_str}')
    print(f'Cost: ${cost:.5f}')
    
    return response, message_history_id

In [10]:
# specify system role and user message
system_role = 'you are a helpful assistant.'
message = f'help me bake a vanilla cake.'

# create a single instance of ChatGPT 
# so that we can keep track of running costs
chat = ChatGPT(model_provider_order=MODEL_TO_EVALUATE)

response, message_history_id = example_messaging_wrapper(chat, system_role, message, simple_string_response_json)

INFO:root:processing message with 13 tokens...
INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"


Messages to GPT:
[{'role': 'system', 'content': 'you are a helpful assistant.'}, {'role': 'user', 'content': 'help me bake a vanilla cake.'}]
Response from GPT:
Sure! Here’s a simple recipe to bake a vanilla cake:

**Ingredients:**
- 1 and 1/2 cups (190g) all-purpose flour
- 1 cup (200g) sugar
- 1/2 cup (115g) unsalted butter, softened
- 2 large eggs
- 1/2 cup (120ml) milk
- 2 teaspoons vanilla extract
- 1 and 3/4 teaspoons baking powder
- 1/4 teaspoon salt

**Instructions:**
1. Preheat your oven to 350°F (175°C). Grease and flour an 8-inch round cake pan.
2. In a bowl, whisk together the flour, baking powder, and salt.
3. In another bowl, beat the butter and sugar together until light and fluffy.
4. Add the eggs one at a time, beating well after each addition. Stir in the vanilla extract.
5. Add the dry ingredients to the wet mixture in three parts, alternating with the milk. Begin and end with the dry ingredients. Mix until just combined.
6. Pour the batter into the prepared pan and 

# SST

In [11]:
import pandas as pd
from scipy import stats

In [12]:
DATA_ROOT = './data/SST/'  # please contact the authors for access to the data

good_th_dict = pickle.load(open(f'{DATA_ROOT}/SST_data.pickle', 'rb'))
print(len([x for x in good_th_dict]))

60


In [13]:
questions = [
    "I am not a worrier.",
    "I like to have a lot of people around me.",
    "I don’t like to waste my time daydreaming.",
    "I try to be courteous to everyone I meet.",
    "I keep my belongings neat and clean.",
    "I often feel inferior to others.",
    "I laugh easily.",
    "Once I find the right way to do something I stick to it.",
    "I often get into arguments with my family and co-workers.",
    "I’m pretty good about pacing myself so as to get things done on time.",
    "When I’m under a great deal of stress sometimes I feel like I’m going to pieces.",
    "I don’t consider myself especially 'light-hearted.'",
    "I am intrigued by the patterns I find in art and nature.",
    "Some people think I’m selfish and egotistical.",
    "I am not a very methodical person.",
    "I rarely feel lonely or blue.",
    "I really enjoy talking to people.",
    "I believe letting students hear controversial speakers can only confuse and mislead them.",
    "I would rather cooperate with others than compete with them.",
    "I try to perform all the tasks assigned to me conscientiously.",
    "I often feel tense and jittery.",
    "I like to be where the action is.",
    "Poetry has little or no effect on me.",
    "I tend to be cynical and skeptical of others’ intentions.",
    "I have a clear set of goals and work toward them in an orderly fashion.",
    "Sometimes I feel completely worthless.",
    "I usually prefer to do things alone.",
    "I often try new and foreign foods.",
    "I believe that most people will take advantage of you if you let them.",
    "I waste a lot of time before settling down to work.",
    "I rarely feel fearful or anxious.",
    "I often feel as if I’m bursting with energy.",
    "I seldom notice the moods or feelings that different environments produce.",
    "Most people I know like me.",
    "I work hard to accomplish my goals.",
    "I often get angry at the way people treat me.",
    "I am a cheerful high-spirited person.",
    "I believe we should look to our religious authorities for decisions on moral issues.",
    "Some people think of me as cold and calculating.",
    "When I make a commitment I can always be counted on to follow through.",
    "Too often when things go wrong I get discouraged and feel like giving up.",
    "I am not a cheerful optimist.",
    "Sometimes when I am reading poetry or looking at a work of art I feel a chill or wave of excitement.",
    "I’m hard-headed and tough-minded in my attitudes.",
    "Sometimes I’m not as dependable or reliable as I should be.",
    "I am seldom sad or depressed.",
    "My life is fast-paced.",
    "I have little interest in speculating on the nature of the universe or the human condition.",
    "I generally try to be thoughtful and considerate.",
    "I am a productive person who always gets the job done.",
    "I often feel helpless and want someone else to solve my problems.",
    "I am a very active person.",
    "I have a lot of intellectual curiosity.",
    "If I don’t like people I let them know it.",
    "I never seem to be able to get organized.",
    "At times I have been so ashamed I just wanted to hide.",
    "I would rather go my own way than be a leader of others.",
    "I often enjoy playing with theories or abstract ideas.",
    "If necessary I am willing to manipulate people to get what I want.",
    "I strive for excellence in everything I do."
]
for i in range(len(questions)):
    questions[i] = f'Question {i+1}: ' + questions[i]

In [15]:
questions[:2]

['Question 1: I am not a worrier.',
 'Question 2: I like to have a lot of people around me.']

In [16]:
def get_NEO_FFI_prompt(thoughts, questions):
    prompt_template = f"""
Your task is to respond to the following NEO-FFI questions based on the participant's spontaneous stream of thoughts, provided below. Respond as though you are the individual who generated these thoughts, reflecting their personality traits.
Base your answer on inferred personality traits. Think carefully about what the thoughts imply about tendencies and behaviors.

NEO-FFI questions to answer:
{questions}

For each question, select the most appropriate option:
- Strongly Disagree: The statement is definitely false or the participant would strongly disagree with it.
- Disagree: The statement is mostly false or the participant would generally disagree with it.
- Neutral: The participant would be neutral on the statement, cannot decide, or find the statement equally true and false.
- Agree: The statement is mostly true or the participant would generally agree with it.
- Strongly Agree: The statement is definitely true or the participant would strongly agree with it.

Then:
Provide 3-5 high-level themes that explain *why* you gave the ratings above. Do not provide one theme per question, instead focus on the most significant patterns or insights that emerge across the questions above. 
For each theme, include:
  - A brief explanation of a theme that informed your judgment.
  - All direct quotes from the participant's stream of thoughts that support the theme and explanation.
  - Remember: Do not paraphrase or invent quotes, the quotes must be exactly as given in the participant's stream of thoughts below.

Participant's spontaneous stream of thoughts:
{thoughts}
"""
    return prompt_template

In [17]:
dimension_questions = {
    'Neuroticism': [1, 6, 11, 16, 21, 26, 31, 36, 41, 46, 51, 56],
    'Extraversion': [2, 7, 12, 17, 22, 27, 32, 37, 42, 47, 52, 57],
    'Openness': [3, 8, 13, 18, 23, 28, 33, 38, 43, 48, 53, 58],
    'Agreeableness': [4, 9, 14, 19, 24, 29, 34, 39, 44, 49, 54, 59],
    'Conscientiousness': [5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60]
}


In [19]:
def find_required_fields(schema, parent_key=''):
    required_fields = []
    if 'required' in schema:
        # If parent_key exists, prefix it to the required field names
        for field in schema['required']:
            full_field_name = f"{parent_key}.{field}" if parent_key else field
            required_fields.append(full_field_name)

    if 'properties' in schema:
        for key, value in schema['properties'].items():
            new_parent_key = f"{parent_key}.{key}" if parent_key else key
            required_fields.extend(find_required_fields(value, new_parent_key))

    return required_fields


def get_value_from_path(data, path):
    keys = path.split('.')
    for key in keys:
        if isinstance(data, list):
            key = int(key)
        data = data[key]
    return data


def get_required_values(schema, response):
    required_fields = find_required_fields(schema)
    required_values = {}

    for field in required_fields:
        try:
            value = get_value_from_path(response, field)
            required_values[field] = value
        except KeyError:
            required_values[field] = None  # Handle missing values if needed

    return required_values

In [20]:
def generic_messaging_wrapper(chat, system_role, message, json_schema):
    # with halt_on_error set to True in ChatGPT class, 
    # we use exception propogation to handle errors and edge-cases
    message_history_id = -1
    required_values = None
    try:
        response, message_history_id = chat.send_message(system_role=system_role,
                                                         message=message, json_schema=json_schema)
        assert response is not None
        response_json = json.loads(response.choices[0].message.content)
        required_values = response_json
    except Exception as e:
        response_json = {}
        chat.logger.info(f'Messaging wrapper failure - {str(e)}')
        print(traceback.format_exc())

    cost, n_prompt_tokens, n_completion_tokens = chat.get_running_cost_num_prompt_completion_tokens()
    return required_values, cost

In [21]:
system_role = ''
print(system_role)




In [22]:
import re
import json

def generate_neo_schema(questions):
    """
    Given a list of strings like "Question 1: I am not a worrier.",
    returns a JSON‐schema dict where each question becomes an enum‐string field
    (Strongly Disagree … Strongly Agree), plus a 'justifications' array.
    """
    OPTIONS = [
        "Strongly Disagree",
        "Disagree",
        "Neutral",
        "Agree",
        "Strongly Agree"
    ]

    properties = {}
    required = []
    
    for q in questions:
        # e.g. "Question 1" → "Question_1"
        key = q.split(':')[0].replace(' ', '_')
        properties[key] = {
            "type": "string",
            "description": f"Response to '{q.strip()}'",
            "enum": OPTIONS
        }
        required.append(key)
    
    # justifications stays the same
    properties["justifications"] = {
        "type": "array",
        "description": "Each entry provides an explanation and supporting quotes.",
        "items": {
            "type": "object",
            "additionalProperties": False,
            "properties": {
                "explanation": {
                    "type": "string",
                    "description": "A brief explanation of a theme or observation."
                },
                "quotes": {
                    "type": "array",
                    "description": "Direct quotes from the stream of thoughts that support the explanation.",
                    "items": {
                        "type": "object",
                        "additionalProperties": False,
                        "properties": {
                            "text": {
                                "type": "string",
                                "description": "The exact quote."
                            }
                        },
                        "required": ["text"]
                    },
                }
            },
            "required": ["explanation", "quotes"]
        }
    }
    required.append("justifications")

    schema = {
        "name": "neo_ffi_assessment_from_stream_of_thoughts",
        "description": (
            "Rates each NEO-FFI item from participant’s spontaneous stream of thoughts, "
            "plus structured justifications with supporting quotes."
        ),
        "schema": {
            "type": "object",
            "additionalProperties": False,
            "strict": True,
            "properties": properties,
            "required": required
        }
    }

    return schema

In [23]:
import re
import difflib
import unicodedata

def normalize_text(s, case_insensitive=True, unicode_normalize=True):
    """
    - Strip leading/trailing whitespace
    - Collapse all internal whitespace to single spaces
    - Optionally lowercase
    - Optionally apply Unicode NFC normalization
    """
    # Unicode normalization (e.g. é → e + ´)
    if unicode_normalize:
        s = unicodedata.normalize('NFC', s)
    # Collapse whitespace
    s = ' '.join(s.split())
    # Lowercase if desired
    if case_insensitive:
        s = s.lower()
    s = s.replace('\n', ' ')
    return s

def longest_common_substring(a_raw, b_raw):
    """
    Returns the longest substring common to both a and b.
    Uses difflib.SequenceMatcher under the hood.
    """
    a = normalize_text(a_raw)
    b = normalize_text(b_raw)

    matcher = difflib.SequenceMatcher(None, a, b)
    match = matcher.find_longest_match(0, len(a), 0, len(b))
    if match.size == 0: return ''
    return a[match.a : match.a + match.size]

In [24]:
import re

def format_neo_summary(data, thoughts):
    """
    Given a dict matching your NEO‐FFI schema—
    with keys like "Question_1", "Question_6", … and a "justifications" list—
    returns a nicely formatted multi‐line string.
    """
    # 1) Collect and sort the question keys by their numeric index
    q_keys = [k for k in data.keys() if re.match(r"Question_\d+$", k)]
    q_keys.sort(key=lambda k: int(k.split("_")[1]))
    
    lines = []
    
    """
    # 2) Add each question + response
    for key in q_keys:
        # turn "Question_1" → "Question 1"
        pretty = key.replace("_", " ")
        resp = data[key]
        lines.append(f"{pretty}: {resp}")
    """

    # 3) Add a spacer before justifications
    lines.append("Justifications:\n")
    
    # 4) Enumerate through each justification entry
    for i, entry in enumerate(data.get("justifications", []), start=1):
        lines.append(f"Reason {i}")
        lines.append(entry["explanation"])
        
        # Citation header
        n_quotes = len(entry["quotes"])
        if   n_quotes == 1: lines.append("  Citation:")
        elif n_quotes > 1:  lines.append("  Citations:")
        
        # The quotes themselves
        for quote in entry["quotes"]:
            matched_quote = longest_common_substring(thoughts, quote['text'])
            if len(matched_quote) > 0:
                if len(matched_quote.split(' ')) > 4:
                    lines.append(f"    \"{matched_quote.strip()}\"")
        
        # blank line between reasons
        lines.append("")
    
    return "\n".join(lines)


In [25]:
def score_neo_trait(response, trait):
    # Scoring mapping
    scoring_map = {'Strongly Disagree': 0, 'Disagree': 1, 'Neutral': 2, 'Agree': 3, 'Strongly Agree': 4}

    # Negative scoring mapping (reverse scoring)
    reverse_scoring_map = {'Strongly Disagree': 4, 'Disagree': 3, 'Neutral': 2, 'Agree': 1, 'Strongly Agree': 0}

    # Define the items for each dimension based on the provided scoring guide
    dimensions = {
        'Neuroticism': [1, 6, 11, 16, 21, 26, 31, 36, 41, 46, 51, 56],
        'Extraversion': [2, 7, 12, 17, 22, 27, 32, 37, 42, 47, 52, 57],
        'Openness': [3, 8, 13, 18, 23, 28, 33, 38, 43, 48, 53, 58],
        'Agreeableness': [4, 9, 14, 19, 24, 29, 34, 39, 44, 49, 54, 59],
        'Conscientiousness': [5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60]
    }

    # Reverse scoring items based on the provided scoring guide
    reverse_items = [1, 16, 31, 46, 12, 27, 42, 57, 3, 8, 18, 23, 38, 48, 9, 14, 24, 29, 39, 44, 54, 59, 15, 30, 45, 55]

    total = 0
    for item in dimensions[trait]:
        key = f"Question_{item}"
        if key not in response:
            raise KeyError(f"Missing response for {key}")
        answer = response[key]
        if answer not in scoring_map:
            raise ValueError(f"Unrecognized response {answer!r} for {key}")

        # choose the correct map
        if item in reverse_items:
            total += reverse_scoring_map[answer]
        else:
            total += scoring_map[answer]

    return total

In [26]:
chat = ChatGPT(model_provider_order=MODEL_TO_EVALUATE)

trait = 'Neuroticism'
subject = '5'

thoughts = '\n'.join([a for b in good_th_dict[subject] for a in b])
neu_questions = [questions[x-1] for x in dimension_questions[trait]]
neu_schema = generate_neo_schema(neu_questions)

message = get_NEO_FFI_prompt(thoughts, '\n'.join(neu_questions))
required_values, cost = generic_messaging_wrapper(chat, system_role, message, neu_schema)

trait_score = score_neo_trait(required_values, trait)
trait_reasoning = format_neo_summary(required_values, thoughts)

print(trait_score)
print(trait_reasoning)

INFO:root:processing message with 4680 tokens...
INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"


15
Justifications:

Reason 1
Generally positive mood and low sadness or depression. The participant expresses enjoyment in daily activities, looks forward to things, and does not dwell on negative emotions.
  Citations:
    "i like long bus rides i feel its very calming"
    "i just i just want to stay in bed and watch tv"
    "im excited for the christmas food because then we can eat a lot and have presents"
    "i think my brain is just all calm right now"
    "i guess i dont really have any private thoughts"

Reason 2
Low self-consciousness and low feelings of inferiority or worthlessness. The participant does not express shame, self-hate, or a sense of being less than others.
  Citations:
    "i feel like i need to wash my face every day i only wash my face like sometimes like every other day but i need to wash it every single day so i wont be dry"
    "i think i need to when i get home i need to eat"
    "i think my brain is just all calm right now"

Reason 3
Mild worry and stress

In [28]:
gpt_scores = pd.read_csv(f'{DATA_ROOT}/sst_gpt_41_text_per_question_scores.csv')
gpt_scores.head()

Unnamed: 0.1,Unnamed: 0,subject,neuroticism,extraversion,openness,agreeableness,conscientiousness
0,5,5,18,24,23,36,13
1,17,17,36,29,29,36,11
2,23,23,32,17,29,20,12
3,24,24,19,19,24,35,12
4,29,29,23,23,28,36,13


In [29]:
outputs = []
for trait in tqdm(dimension_questions):
    trait_scores_df = gpt_scores.sort_values(by=trait.lower(), ascending=True)[['subject', trait.lower()]]
    for i in tqdm(range(3), desc=trait, leave=False):
        subject, true_trait_score = trait_scores_df.values[i, :]
        chat = ChatGPT(model_provider_order=MODEL_TO_EVALUATE)
        thoughts = '\n'.join([a for b in good_th_dict[str(subject)] for a in b])
        neu_questions = [questions[x-1] for x in dimension_questions[trait]]
        neu_schema = generate_neo_schema(neu_questions)
        message = get_NEO_FFI_prompt(thoughts, neu_questions)
        required_values, cost = generic_messaging_wrapper(chat, system_role, message, neu_schema)
        gpt_trait_score = score_neo_trait(required_values, trait)
        trait_reasoning = format_neo_summary(required_values, thoughts)  
        outputs.append((subject, f'low_{trait}', true_trait_score, gpt_trait_score, trait_reasoning))
        
        subject, true_trait_score = trait_scores_df.values[-(i+1), :]
        chat = ChatGPT(model_provider_order=MODEL_TO_EVALUATE)
        thoughts = '\n'.join([a for b in good_th_dict[str(subject)] for a in b])
        neu_questions = [questions[x-1] for x in dimension_questions[trait]]
        neu_schema = generate_neo_schema(neu_questions)
        message = get_NEO_FFI_prompt(thoughts, neu_questions)
        required_values, cost = generic_messaging_wrapper(chat, system_role, message, neu_schema)
        gpt_trait_score = score_neo_trait(required_values, trait)
        trait_reasoning = format_neo_summary(required_values, thoughts)  
        outputs.append((subject, f'high_{trait}', true_trait_score, gpt_trait_score, trait_reasoning))

A Jupyter Widget

A Jupyter Widget

INFO:root:processing message with 3934 tokens...
INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
INFO:root:processing message with 5197 tokens...
INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
INFO:root:processing message with 2785 tokens...
INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
INFO:root:processing message with 6086 tokens...
INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
INFO:root:processing message with 3554 tokens...
INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
INFO:root:processing message with 4531 tokens...
INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"


A Jupyter Widget

INFO:root:processing message with 4635 tokens...
INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
INFO:root:processing message with 3842 tokens...
INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
INFO:root:processing message with 2202 tokens...
INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
INFO:root:processing message with 4017 tokens...
INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
INFO:root:processing message with 5196 tokens...
INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
INFO:root:processing message with 1618 tokens...
INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"


A Jupyter Widget

INFO:root:processing message with 2524 tokens...
INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
INFO:root:processing message with 5974 tokens...
INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
INFO:root:processing message with 3750 tokens...
INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
INFO:root:processing message with 4023 tokens...
INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
INFO:root:processing message with 5247 tokens...
INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
INFO:root:processing message with 5581 tokens...
INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"


A Jupyter Widget

INFO:root:processing message with 4269 tokens...
INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
INFO:root:processing message with 3154 tokens...
INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
INFO:root:processing message with 5225 tokens...
INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
INFO:root:processing message with 3065 tokens...
INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
INFO:root:processing message with 1849 tokens...
INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
INFO:root:processing message with 3399 tokens...
INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"


A Jupyter Widget

INFO:root:processing message with 2036 tokens...
INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
INFO:root:processing message with 5565 tokens...
INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
INFO:root:processing message with 5477 tokens...
INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
INFO:root:processing message with 2508 tokens...
INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
INFO:root:processing message with 3268 tokens...
INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
INFO:root:processing message with 5072 tokens...
INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"


In [30]:
scales_df = pd.read_csv(f'{DATA_ROOT}/SST_scales.csv')

In [31]:
df = pd.DataFrame(outputs)
df.columns = ['subject', 'trait_description', 'self_reported', 'gpt_predicted', 'gpt_reasoning']
df[['level','trait']] = df['trait_description'].str.split('_', n=1, expand=True)
df = df.sort_values(by=['trait', 'level'])

true_scales = []
for sub_i, sub in enumerate(df.subject.values):
    s_trait = df.trait.values[sub_i].lower()
    t_v = scales_df[scales_df.subject == sub][s_trait].values.flatten()[0]
    true_scales.append(t_v)
df['self_reported'] = true_scales

df = df[['level', 'trait', 'gpt_reasoning']] #'subject', 'self_reported', 'gpt_predicted']]

df.to_csv(f'{DATA_ROOT}/SST_gpt_41_reasoning.csv', index=False)

INFO:numexpr.utils:Note: NumExpr detected 32 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
