In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import pickle

from openai import OpenAI
import time

import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
# Directory prefix (note the trailing slash) that is string-concatenated in
# front of every data path used in this notebook.
here = '1_Method/'

## Helpers

In [None]:
from collections import defaultdict
import spacy

from nltk.corpus import wordnet as wn
# Shared spaCy pipeline; _group_by_lemma calls it to obtain token lemmas.
nlp = spacy.load("en_core_web_sm")

def _is_adjective(word: str) -> bool:
    if not isinstance(word, str):
        return False
    return any(ss.pos() in ("a", "s") for ss in wn.synsets(word))


def _group_by_lemma(words):
    """
    Group adjectives by their lemma.
    Skips multi-word phrases & any with punct.
    """

    lemma_groups = defaultdict(list)
    for word in words:
        doc = nlp(word)
        if doc and len(doc) > 0:
            lemma = doc[0].lemma_
            lemma_groups[lemma].append(word)
    return lemma_groups

    return dict(groups)

def _group_by_prefix(words):
    """
    Group adjectives by their first 4 characters.
    Skips multi-word phrases & any with punct.
    """
    groups = defaultdict(list)

    for w in words:
        key = w[:4]  # first 4 characters
        groups[key].append(w)

    return dict(groups)

from typing import Dict, List

def _pack_groups(groups: Dict[str, List[str]], pack_size: int = 5) -> List[List[str]]:
    """
    Return a list of lists where:
      - any group with >1 word is kept as-is
      - all singleton groups are merged into new lists of length `pack_size`
        (the final one may be shorter)
    """
    result: List[List[str]] = []
    singletons: List[str] = []

    for key, words in groups.items():
        if len(words) > 1:
            result.append(words)
        elif len(words) == 1:
            singletons.append(words[0])

    # Pack the singletons
    for i in range(0, len(singletons), pack_size):
        result.append(singletons[i:i + pack_size])

    return result

def preprocess(words):
    """
    Lower-case candidate words, drop unusable ones, and group the rest by lemma.

    A word is dropped when it is a multi-word phrase, contains any
    non-alphabetic character, or is not recognised by `_is_adjective`.

    Args:
        words: iterable of strings.

    Returns:
        The result of `_group_by_lemma` applied to the surviving lower-cased
        words (lemma -> list of words).
    """
    kept = []
    for w in words:
        w = w.lower()
        # Failing any one check rejects the word. These tests used to be
        # chained with `and`, which rejected a word only if it failed all
        # three, letting punctuated single words and non-adjectives through.
        if len(w.split()) > 1 or not w.isalpha() or not _is_adjective(w):
            continue
        kept.append(w)

    return _group_by_lemma(kept)

## Load Candidate Adjectives

In [None]:
# Sentiment dictionaries: the VADER lexicon (tab-separated, no header row)
# and the Evaluative Lexicon spreadsheet.
vader_path = here + "data/dictionary/vader_lexicon.txt"
el_path = here + "data/dictionary/evaluative_lexicon.xlsx"

vader = pd.read_csv(vader_path, sep="\t", header=None)
el = pd.read_excel(el_path)

# Theory driven words: hand-entered item lists. Case is left as typed here;
# the words are lower-cased when they are pooled with the lexicon words.
panas = 'interested, distressed, Excited, upset, strong, guilty, scared, hostile, enthusiastic, proud, irritable, alert, ashamed, inspired, nervous, determined, attentive, jittery, active, afraid'
# NOTE(review): 'joy' is a noun; the intended SPANE item may be 'joyful' -- confirm against the scale.
spane = 'positive, good, pleasant, joy, happy, contented, negative, bad, unpleasant, sad, afraid, angry'

panas = panas.split(', ')
spane = spane.split(', ')

# gls: one word per line; blank lines only separate visual groups and are dropped.
gls = """
Interesting
Reflective
Bored
Dull
Unstimulated

Happy
Comfortable
Pleasant
Upbeat
Sad
Unhappy
Unpleasant

Fulfilled
Coherent
Meaningless
Pointless

Dramatic
Engaging
Uneventful
Monotonous

Enjoyable
Unstable

Disorganized
Meaningful
Purposeful
Virtuous
"""
gls = gls.split('\n')
gls = [g.strip() for g in gls if g.strip()]

# prlq: same one-word-per-line format. "Adventurous" was previously misspelled
# "Advanterous", which would have been passed on as a candidate adjective.
prlq = """
Intense
Unique
Unusual
Adventurous
"""
prlq = prlq.split('\n')
prlq = [p.strip() for p in prlq if p.strip()]

In [None]:
def _get_words(df):
      # keep only alphabetic words
    df = df[df.iloc[:, 0].apply(lambda x: str(x).isalpha())].copy()
    df['pos'] = df.iloc[:, 0].apply(_is_adjective)

    # get adjective words
    words = df[df['pos']].iloc[:, 0].tolist()

    return words

In [None]:
# Adjective candidates extracted from each lexicon.
vader_words = _get_words(vader)
el_words = _get_words(el)

# Pool lexicon-derived and theory-driven words, lower-case them, and
# de-duplicate. The result is sorted so that its order (and therefore the
# pickled file and the order of API calls) is the same on every run; a bare
# list(set(...)) changes order between kernel sessions because string hashing
# is salted per process.
candidate_words = vader_words + el_words + gls + prlq + spane + panas
new_words = sorted({w.lower() for w in candidate_words})

In [None]:
# Persist the pooled candidate list so the filtering step can reload it.
adjectives_path = here + 'data/Adjectives.pkl'
with open(adjectives_path, 'wb') as out_file:
    pickle.dump(new_words, out_file)

## Filter

In [None]:
# Reload the candidate list written by the previous section.
adjectives_path = here + 'data/Adjectives.pkl'
with open(adjectives_path, 'rb') as in_file:
    adjectives = pickle.load(in_file)

In [None]:
# API client; the key is read from the OPENAI_API_KEY environment variable
# (os.getenv returns None when the variable is unset).
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

def prompt(adj_list) -> str:
    """
    Build the instruction text asking the model which adjectives fit
    "I want to live a ___ life".

    Args:
        adj_list: the candidate adjective(s); interpolated verbatim after
            "Given:" using its default string formatting.

    Returns:
        The complete prompt string (it begins and ends with a newline).
    """
    return f"""
You will evaluate adjectives for whether they naturally describe a person's life when used in
"I want to live a ___ life" or "I don't want to live a ___ life."

Given: {adj_list}

Rules:
- Return ONLY a comma-separated list of valid adjectives in lowercase, e.g., "word x, word y, word z".
- If the words share a lemma, use BASE FORM (positive degree, lemma). No superlatives (e.g., use "happy" not "happier/happiest").
- Each item must be a single adjective (no nouns, verbs, or multi-word terms) and it may include a natural hyphen.
- Do not merge or remove the hyphen. Keep it as-is if it is a natural English adjective.
- Must sound natural when used in a sentence right before the word "life". You should skip the adjectives
    that do not make much sense when used right before the word "life" as a life descriptor.
- If none are suitable, return "none".
"""

In [None]:
# Maps each candidate adjective to the model's raw text verdict.
processed = {}

for word in tqdm(adjectives):
    # Keep the rendered text under its own name. Assigning it to `prompt`
    # would replace the prompt() function with a string, and the second
    # iteration would fail with "TypeError: 'str' object is not callable".
    prompt_text = prompt(word)

    response = client.responses.create(
        model="gpt-5-nano",
        input=prompt_text,
    )

    print(prompt_text)
    print(response.output_text)

    processed[word] = response.output_text

In [None]:
# Save the model's verdict for every adjective.
filtered_path = here + 'data/Adjective_Filtered.pkl'
with open(filtered_path, 'wb') as out_file:
    pickle.dump(processed, out_file)