In [None]:
# Install (or upgrade to the newest release of) the third-party packages this notebook imports.
!pip install --upgrade openai pandas tqdm matplotlib seaborn

In [None]:
import os, time, json
import pandas as pd
from tqdm import tqdm
from getpass import getpass
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
from google.colab import drive

# Mount Google Drive into the Colab filesystem at /content/drive.
# NOTE(review): presumably the input/output paths configured below are meant to live under this mount — confirm.
drive.mount('/content/drive')

In [None]:
# ---- Run configuration ----
# Both paths are placeholders and must be replaced before running.
# CSV of posts to annotate; it is loaded with pd.read_csv.
INPUT_CSV_PATH = "/inputPath/FileName"
# Directory (created if missing) that receives the raw JSONL log and the parsed CSV.
OUTPUT_FOLDER = "/outputPath/FileName"
# Name of the input column holding each post's id.
ID_COL = "id"
# Name of the input column holding the post body that is sent to the model.
TEXT_COL = "selftext"

# Model name passed to client.chat.completions.create.
MODEL_NAME = "gpt-4o"
# Pause, in seconds, after each post whose API call succeeded.
DELAY_SECONDS = 0.6
# A failing API call is retried up to this many times before giving up.
MAX_RETRIES = 6
# The wait before retry number k is BACKOFF_BASE ** k seconds.
BACKOFF_BASE = 2.0

In [None]:
# Prompt sent as the user message for every post. It is filled in with
# str.format(post_id=..., post_text=...), so the only live placeholders are
# {post_id} and {post_text}; the doubled braces around the JSON schema are
# escapes that render as literal "{" and "}". Any new literal brace added to
# this text must be doubled as well, or str.format will raise.
# NOTE(review): the symptom instructions ask both for matching against the
# dictionary *definitions* and for exact word-for-word *name* matches — confirm
# which rule is intended before changing the wording.
PROMPT_TEMPLATE = """You are an expert medical text annotator specializing in Alzheimer's and dementia. You will be given a Reddit post (with a unique "id") about a patient with dementia. Extract STRUCTURED INFORMATION about the PATIENT only (ignore the caregiver/author). If multiple patients are mentioned, focus on the MAIN patient.


Extraction Guidelines:


Age:
- If an exact age is mentioned, return it as a string (e.g., "72").
- If an age range or approximate is given, return that (e.g., "70s", "mid-60s").
- If no age info, return "unknown".
- Do not infer age from context or clues.


Gender:
- If a clear indicator exists (pronouns he/him/his or she/her/hers, or unambiguous family terms like father/mother, brother/sister, etc.), return "male" or "female".
- If ambiguous or gender-neutral, return "unknown".
- Do not guess gender from names, roles, or stereotypes; do not use caregiver's gender.


Symptoms:
- Identify all symptoms or cognitive/behavioral changes explicitly attributed to the patient. Ignore general comments and caregiver feelings.
- For each symptom described:
  - If it matches a definition in the symptom dictionary below, add the exact symptom name (with correct capitalization) to known_symptoms.
  - If it does not match any definition, create a concise term from the description and add it to rare_symptoms. (Use a precise noun or phrase; e.g., if seeing things not there, use "Hallucinations".)
- Avoid duplicates: List each symptom only once (treat "Hallucination" and "hallucinations" as the same symptom).
- Do not hallucinate: Only extract symptoms actually mentioned; do not invent or infer any others.
- If no symptoms are mentioned, set both known_symptoms and rare_symptoms to empty lists ([]).

CRITICAL INSTRUCTION - STRICT MATCHING REQUIRED

A symptom can ONLY be tagged as "known_symptoms" if it matches EXACTLY (word-for-word) with one of the 29 symptom names in the dictionary below.

STRICT RULES:
- Use the EXACT symptom name from the list (including capitalization and spacing)
- DO NOT use synonyms (e.g., "Hallucinations" is NOT in the list, so tag as rare)
- DO NOT paraphrase or reword symptoms
- DO NOT infer similar meanings (e.g., "Forgetfulness" ≠ "Forgetting Recent Information")
- DO NOT add symptoms not explicitly listed
- If you're unsure or the symptom seems similar but isn't exact → tag as "rare_symptoms"

Before tagging as "known_symptoms":
1. Read the exact symptom text from the post
2. Search for an EXACT match in the 29-item dictionary below
3. If found → use exact name; If not found → create concise term for rare_symptoms


STANDARD ALZHEIMER'S SYMPTOM DICTIONARY (with Definitions):


1. Repetitive Speech: Repeat statements and questions over and over.
2. Forgetting Recent Information: Forget conversations, appointments or events.
3. Misplacing Objects: Misplace items, often putting them in places that don't make sense.
4. Spatial Disorientation: Trouble with visual and spatial abilities, like getting lost while driving, get lost in places they used to know well.
5. Name Recall Issues: Forget the names of family members and everyday objects.
6. Communication Difficulties: Have trouble finding the right words, expressing thoughts or having conversations. Difficulty speaking, finding words, or following conversations.
7. Memory Loss: Everyone has trouble with memory at times, but the memory loss related to Alzheimer's disease is lasting. Over time, memory loss affects the ability to function at work and at home.
8. Concentration and Thinking Problems: Trouble concentrating and thinking, especially about abstract concepts such as numbers.
9. Multitasking Challenges: Doing more than one task at once is especially hard.
10. Financial Management Issues: Challenging to manage finances, balance checkbooks and pay bills on time.
11. Numerical Recognition Loss: Eventually may not recognize numbers.
12. Decision-Making Difficulties: Hard to make sensible decisions and judgments.
13. Inappropriate Choices: Make poor choices in social settings or wear clothes for the wrong type of weather.
14. Reasoning/Problem-Solving Issues: Difficulty thinking through problems or making decisions. Everyday problems may be hard to solve.
15. Emergency Response Problems: May not know how to handle food burning on the stove or how to make decisions when driving.
16. Routine/Complex Task Challenges: Routine activities that involve completing steps in a certain order can be hard. Trouble performing tasks with multiple steps.
17. Planning and Organization Difficulties: Difficulty managing time, schedules, or daily activities. Trouble planning and cooking a meal or playing a favorite game.
18. Basic Skill Loss: As Alzheimer's disease becomes advanced, forget how to do basic tasks such as dressing and bathing.
19. Changes in Personality and Behavior: Brain changes that occur in Alzheimer's disease can affect moods and behaviors.
20. Depression: Feeling sad, hopeless.
21. Apathy: Loss of interest in activities.
22. Social Withdrawal: Avoiding people and usual activities.
23. Mood Swings: Experiencing rapid emotional changes, such as sudden anger or frustration.
24. Trust Issues: Not trusting others.
25. Aggressive Behavior: Anger or aggression.
26. Sleep Disturbances: Changes in sleeping habits.
27. Wandering: Walking aimlessly, often getting lost.
28. Loss of Inhibitions: Doing or saying things unusual for the person, like inappropriate comments.
29. Delusions: False beliefs, such as believing something has been stolen when it hasn't.

ALLOWED KNOWN SYMPTOMS (EXACT NAMES ONLY):
Use ONLY these exact strings for known_symptoms:
["Repetitive Speech", "Forgetting Recent Information", "Misplacing Objects", "Spatial Disorientation", "Name Recall Issues", "Communication Difficulties", "Memory Loss", "Concentration and Thinking Problems", "Multitasking Challenges", "Financial Management Issues", "Numerical Recognition Loss", "Decision-Making Difficulties", "Inappropriate Choices", "Reasoning/Problem-Solving Issues", "Emergency Response Problems", "Routine/Complex Task Challenges", "Planning and Organization Difficulties", "Basic Skill Loss", "Changes in Personality and Behavior", "Depression", "Apathy", "Social Withdrawal", "Mood Swings", "Trust Issues", "Aggressive Behavior", "Sleep Disturbances", "Wandering", "Loss of Inhibitions", "Delusions"]


CAREGIVER MOTIVATIONS:
- In addition to patient information, identify the CAREGIVER'S MOTIVATIONS for posting this on Reddit.
- Match each identified action/intent against the dictionary (1-13) below. If it aligns with a definition, use the EXACT name (case-sensitive).
- If an action/intent does NOT match any dictionary definition, use "Other: [3-7 word description]" in lowercase.
- Multiple motivations allowed per post.
- Only extract motivations EXPLICITLY stated or CLEARLY IMPLIED—do not infer or guess.
- If no clear motivation exists, return empty list [].


STANDARD CAREGIVER MOTIVATION DICTIONARY:


1. Seeking_Informational_Support: Asking for advice, guidance, or information about caregiving, medical issues, treatments, or resources.
2. Seeking_Emotional_Support: Expressing feelings, seeking empathy, understanding, or validation of emotions.
3. Community_Building: Expressing gratitude to the community, thanking others, or connecting with others in similar situations.
4. Venting: Expressing frustration, anger, or stress without explicitly asking for advice.
5. Sharing_Experience: Telling personal stories or providing updates without asking questions.
6. Seeking_Tactical_Strategies: Asking for specific day-to-day management tips or practical coping strategies.
7. Navigating_Systems: Seeking help with healthcare, insurance, legal, financial, or institutional systems.
8. Seeking_Validation: Seeking affirmation they're doing the right thing or that their feelings are valid.
9. Concerns_About_Care_Quality: Expressing concerns about abuse, neglect, or poor quality care in facilities.
10. Seeking_Belonging: Expressing isolation and seeking to feel part of a community.
11. Understanding_Symptoms: Asking about specific symptoms, behaviors, or disease progression.
12. Awareness_Raising: Sharing information to educate others or advocate for better care/policies.
13. Celebrating_Positive_Moments: Sharing small victories, positive moments, or good days.
14. Other: Any motivation not captured by the above categories. When using this, provide a brief description in the format "Other: [description]".


OUTPUT FORMAT
Return your answer ONLY in this strict JSON format (no additional commentary, no extra text, no explanations, no deviations—output MUST be valid JSON and exactly match this schema):
{{
"id": "<copy the EXACT id provided with the post>",
"age": "...",
"gender": "...",
"known_symptoms": ["...", "..."],
"rare_symptoms": ["...", "..."],
"motivations": ["...", "..."]
}}


- Copy the id exactly as provided.
- Always return "unknown" for age/gender if missing.
- Always return an empty list [] if there are no symptoms.
- Always return an empty list [] if no clear motivations can be identified.
- Use EXACT motivation names from the dictionary (case-sensitive).
- For non-standard motivations, use format: "Other: [brief description]" (e.g., "Other: seeking respite care recommendations").
- Keep "Other" descriptions concise (3-7 words) and lowercase after the colon.
- Do not add extra fields or text.


RULES: Follow instructions exactly. Do NOT infer, guess, or add any information not explicitly stated in the post. Extract patient details (age, gender, symptoms) and caregiver motivations separately. For motivations, prioritize matching to the dictionary; only use "Other:" when truly necessary. Ensure the JSON is valid and follows the schema above.


Post ID: {post_id}
Post text: '''{post_text}'''
"""


In [None]:
# Create the output directory (and any missing parents) up front; a directory
# that already exists is left untouched.
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

# Load every column as text so ids and post bodies are not coerced to numbers.
sample_df = pd.read_csv(filepath_or_buffer=INPUT_CSV_PATH, dtype=str)

# Report how many posts were read, then preview the id/text columns of the
# first three rows as a quick sanity check.
print(f"Loaded {len(sample_df):,} sample posts from: {INPUT_CSV_PATH}")
print(sample_df.loc[:, [ID_COL, TEXT_COL]].head(3))

In [None]:
from openai import OpenAI

# Read the key interactively so it is never stored in the notebook. Pasted keys
# often carry stray whitespace or a trailing newline, which can make requests
# fail authentication, so strip it and reject an empty value right away instead
# of discovering the problem through failed API calls later.
api_key = getpass("Paste your OpenAI API key (it will not be shown): ").strip()
if not api_key:
    raise ValueError("No OpenAI API key entered; re-run this cell and paste a key.")

# Export the key through the environment as well, and build the client object
# that the model-calling code uses.
os.environ["OPENAI_API_KEY"] = api_key
client = OpenAI(api_key=api_key)
print("OpenAI client ready.")

In [None]:
def call_model_with_backoff(prompt_text):
    """Send one prompt to the chat model, retrying failed calls with exponential backoff.

    Args:
        prompt_text: Fully formatted prompt, sent as the user message.

    Returns:
        A ``(out_text, resp)`` tuple: the content of the first choice's message
        and the complete response object returned by the client.

    Raises:
        RuntimeError: If the call fails with an error that retrying cannot fix,
            or still fails after ``MAX_RETRIES`` retries. The underlying
            exception is attached as ``__cause__``.
    """
    # HTTP statuses that describe a problem with the request or credentials
    # (bad request, unauthenticated, forbidden, not found, unprocessable).
    # Repeating the same call cannot succeed, so do not spend the whole backoff
    # schedule on them.
    non_retryable_statuses = {400, 401, 403, 404, 422}

    messages = [
        {"role": "system", "content": "You are a helpful assistant that analyzes Reddit posts and returns only valid JSON."},
        {"role": "user", "content": prompt_text},
    ]

    attempt = 0
    while True:
        try:
            resp = client.chat.completions.create(
                model=MODEL_NAME,
                messages=messages,
                temperature=0.1,
                # NOTE(review): a reply longer than this limit is cut off, which
                # would leave the JSON incomplete — confirm 400 is enough.
                max_tokens=400,
            )
            out_text = resp.choices[0].message.content
            return out_text, resp
        except Exception as e:
            # Exceptions raised for HTTP error responses are expected to expose
            # the status as `status_code`; anything without it is treated as
            # transient and retried.
            status = getattr(e, "status_code", None)
            if status in non_retryable_statuses:
                raise RuntimeError(f"Non-retryable API error (HTTP {status}): {e}") from e

            attempt += 1
            if attempt > MAX_RETRIES:
                # Chain the cause so the original traceback is not lost.
                raise RuntimeError(f"Max retries exceeded. Last error: {e}") from e
            wait = BACKOFF_BASE ** attempt
            print(f"Call error: {e}. Backing off {wait:.1f}s (attempt {attempt})")
            time.sleep(wait)

In [None]:
# Output files: an append-only log of every raw model reply, and the parsed table.
RAW_JSONL_PATH = os.path.join(OUTPUT_FOLDER, "raw_responses.jsonl")
PARSED_CSV_PATH = os.path.join(OUTPUT_FOLDER, "extracted_results.csv")

# Column order of the parsed CSV; every result row carries exactly these keys.
RESULT_COLUMNS = [
    "id",
    "model_text",
    "age",
    "gender",
    "known_symptoms",
    "rare_symptoms",
    "motivations",
    "parse_error",
]

# Buffered result rows are written to the CSV once this many have accumulated.
SAVE_EVERY = 25


def _blank_result(post_id, model_text=None, parse_error=None):
    """Return a result row for *post_id* with every extracted field unset."""
    row = dict.fromkeys(RESULT_COLUMNS)
    row["id"] = post_id
    row["model_text"] = model_text
    row["parse_error"] = parse_error
    return row


def _parse_model_reply(post_id, model_text):
    """Build a result row from the model's reply text.

    The JSON object is taken as everything from the first "{" to the last "}",
    so text the model adds around the object is ignored. List fields are stored
    as JSON strings so they fit in a single CSV cell. Any failure is recorded
    in the row's "parse_error" field instead of being raised.
    """
    result = _blank_result(post_id, model_text)
    try:
        if model_text is None:
            raise ValueError("Model returned no text")

        js_start = model_text.find("{")
        js_end = model_text.rfind("}")
        if js_start == -1 or js_end <= js_start:
            raise ValueError("No JSON object found in model output")

        parsed_json = json.loads(model_text[js_start:js_end + 1])
        result["age"] = parsed_json.get("age")
        result["gender"] = parsed_json.get("gender")
        for field in ("known_symptoms", "rare_symptoms", "motivations"):
            result[field] = json.dumps(parsed_json.get(field, []))
    except Exception as e:
        # Deliberately broad: one malformed reply must not abort the whole run.
        result["parse_error"] = str(e)
    return result


def _append_results(rows, csv_path):
    """Write *rows* to the end of the CSV at *csv_path*; return how many were written.

    Rows are appended rather than re-reading and rewriting the whole file at
    every checkpoint, which keeps checkpoint cost independent of how much has
    already been saved and leaves earlier rows untouched on disk.
    """
    new_df = pd.DataFrame(rows, columns=RESULT_COLUMNS)
    if not os.path.exists(csv_path):
        new_df.to_csv(csv_path, index=False)
    elif list(pd.read_csv(csv_path, nrows=0).columns) == RESULT_COLUMNS:
        new_df.to_csv(csv_path, mode="a", header=False, index=False)
    else:
        # The existing file has a different column layout, so a plain append
        # would misalign cells; merge by column name and rewrite instead.
        existing = pd.read_csv(csv_path, dtype=str)
        pd.concat([existing, new_df], ignore_index=True).to_csv(csv_path, index=False)
    return len(new_df)


# Resume support: ids already present in the parsed CSV are skipped.
# NOTE(review): rows recorded as "Model call failed" also count as processed,
# so those posts are not retried on a later run — confirm this is intended.
processed_ids = set()
rows_on_disk = 0
if os.path.exists(PARSED_CSV_PATH):
    done_df = pd.read_csv(PARSED_CSV_PATH, dtype=str)
    processed_ids = set(done_df["id"].astype(str).tolist())
    rows_on_disk = len(done_df)
    print(f"Resuming: {len(processed_ids)} posts already processed.")

parsed_results = []  # result rows not yet written to the CSV
count = 0            # posts whose model call succeeded in this run

try:
    with open(RAW_JSONL_PATH, "a", encoding="utf-8") as raw_f:
        for _, row in tqdm(sample_df.iterrows(), total=len(sample_df), desc="Processing posts"):
            post_id = str(row[ID_COL])
            if post_id in processed_ids:
                continue

            # A missing post body is read as NaN; send an empty text rather
            # than the literal string "nan".
            raw_text = row[TEXT_COL]
            post_text = "" if pd.isna(raw_text) else str(raw_text)
            prompt = PROMPT_TEMPLATE.format(post_id=post_id, post_text=post_text)

            model_text = None
            call_error = None
            try:
                model_text, _ = call_model_with_backoff(prompt)
            except Exception as e:
                call_error = str(e)

            # Log the raw exchange immediately so nothing is lost if parsing
            # or a later step fails.
            raw_entry = {"id": post_id, "input_text": post_text, "model_reply": model_text}
            if call_error is not None:
                raw_entry["error"] = call_error
            raw_f.write(json.dumps(raw_entry, ensure_ascii=False) + "\n")
            raw_f.flush()

            if call_error is not None:
                result = _blank_result(post_id, parse_error=f"Model call failed: {call_error}")
            else:
                result = _parse_model_reply(post_id, model_text)
                count += 1
            parsed_results.append(result)
            # Treat a repeated id later in the input the same way a resumed
            # run would: skip it.
            processed_ids.add(post_id)

            # Checkpoint on the number of buffered rows, so failed calls are
            # flushed to disk as well.
            if len(parsed_results) >= SAVE_EVERY:
                rows_on_disk += _append_results(parsed_results, PARSED_CSV_PATH)
                print(f"Saved progress: {rows_on_disk} rows -> {PARSED_CSV_PATH}")
                parsed_results = []

            # Pace successful calls; a failed call has already waited through
            # its retry backoff.
            if call_error is None:
                time.sleep(DELAY_SECONDS)
finally:
    # Runs on normal completion, on an error and on a manual interrupt, so
    # buffered rows are never dropped.
    if parsed_results:
        rows_on_disk += _append_results(parsed_results, PARSED_CSV_PATH)
        print(f"Final save: {rows_on_disk} rows -> {PARSED_CSV_PATH}")
        parsed_results = []

print("Extraction complete. Raw responses and parsed CSV saved.")