# Add your OpenAI (GPT) API key here before running the cells below (e.g. via the OPENAI_API_KEY environment variable).

In [None]:
import pandas as pd
import json, re
from openai import OpenAI

# Module-level OpenAI client, constructed with no explicit arguments.
# NOTE(review): presumably the API key is picked up from the environment
# (OPENAI_API_KEY) — confirm. `client` is not used by any code visible in
# this file.
client = OpenAI()

# ========= Experiment parameters =========
# NOTE(review): within the visible code none of these constants are read;
# run_thematic_analysis_with_checkpoint takes its own arguments with
# different defaults. Confirm whether they are still needed.
TEXT_COL = "selftext"
N_SAMPLE = 80          # number of posts to select for the experiment
MIN_WORDS = 75
MAX_WORDS = 500

THEMATIC_DEV_ROUNDS = 2   # number of repeated rounds for Phase 1–2
APPLY_BATCH_SIZE = 8      # Phase 6 batch size
MODEL = "gpt-5-mini"

def _parse_json_payload(text, default=None):
    """Decode the JSON object or array embedded in an LLM reply.

    The reply may be wrapped in a Markdown ``json`` fence and may carry
    extra prose before or after the JSON value.

    Args:
        text: Raw reply text (may be empty or ``None``).
        default: Value returned when nothing can be decoded; a new empty
            list when omitted.

    Returns:
        The decoded JSON value, or ``default``.
    """
    import json
    import re

    if default is None:
        default = []
    if not text:
        return default

    text = text.strip()
    text = re.sub(r"^```json", "", text)
    text = re.sub(r"```$", "", text)

    starts = [i for i in (text.find("{"), text.find("[")) if i != -1]
    if not starts:
        return default
    first = min(starts)
    last = max(text.rfind("}"), text.rfind("]")) + 1

    try:
        return json.loads(text[first:last])
    except ValueError:
        pass

    # The outermost-bracket slice fails when prose after the JSON contains a
    # stray "]" or "}". Fall back to decoding just the first JSON value.
    try:
        value, _end = json.JSONDecoder().raw_decode(text[first:])
    except ValueError:
        return default
    return value


def _filter_by_word_count(text, min_words, max_words):
    """Return ``text`` truncated to ``max_words`` words, or ``None``.

    ``None`` is returned for non-string input and for text with fewer than
    ``min_words`` whitespace-separated words. Whitespace is normalised to
    single spaces in the returned text.
    """
    if not isinstance(text, str):
        return None
    words = text.split()
    if len(words) < min_words:
        return None
    return " ".join(words[:max_words])


def _split_long_post(text, max_chars=1500):
    """Group the non-empty lines of ``text`` into chunks of about ``max_chars``.

    Lines are joined with single spaces. A single line longer than
    ``max_chars`` is kept whole as its own chunk (it is not split further).
    Empty chunks are never produced.
    """
    if not isinstance(text, str):
        return []
    paragraphs = [p.strip() for p in text.split("\n") if p.strip()]
    chunks, current = [], ""
    for paragraph in paragraphs:
        if not current:
            # Starting a fresh chunk: never flush an empty one, even when the
            # paragraph alone exceeds the limit.
            current = paragraph
        elif len(current) + 1 + len(paragraph) <= max_chars:
            # "+ 1" accounts for the joining space.
            current = f"{current} {paragraph}"
        else:
            chunks.append(current)
            current = paragraph
    if current:
        chunks.append(current)
    return chunks


def _aggregate_codes_to_post(segment_codes):
    """Merge per-segment code lists into one sorted, de-duplicated list per post.

    Entries that are not dicts, lack a ``post_id`` or whose ``codes`` is not
    a list are ignored. Post ids are normalised to strings.
    """
    from collections import defaultdict

    post_codes = defaultdict(set)
    if not isinstance(segment_codes, list):
        return []
    for item in segment_codes:
        if not isinstance(item, dict):
            continue
        pid = item.get("post_id")
        codes = item.get("codes")
        if pid is None or not isinstance(codes, list):
            continue
        for code in codes:
            if code is None:
                continue
            code = str(code).strip()
            if code:
                post_codes[str(pid)].add(code)
    return [
        {"post_id": pid, "codes": sorted(codes)}
        for pid, codes in post_codes.items()
    ]


def _normalize_themes(themebook):
    """Return a clean ``[{"theme_name", "definition"}]`` list from a themebook.

    Accepts either ``{"themes": [...]}`` or a bare list of theme dicts.
    Entries without a usable ``theme_name`` and repeated names are dropped;
    a missing ``definition`` becomes an empty string.
    """
    if isinstance(themebook, dict):
        candidates = themebook.get("themes", [])
    else:
        candidates = themebook
    if not isinstance(candidates, list):
        return []

    themes, seen = [], set()
    for entry in candidates:
        if not isinstance(entry, dict):
            continue
        name = str(entry.get("theme_name") or "").strip()
        if not name or name in seen:
            continue
        seen.add(name)
        themes.append({
            "theme_name": name,
            "definition": str(entry.get("definition") or ""),
        })
    return themes


def _collect_batch_labels(raw_items, batch_ids, allowed_themes):
    """Validate one batch of model labels.

    Args:
        raw_items: Decoded model output; expected to be a list of dicts
            (a single dict is treated as a one-item list).
        batch_ids: Set of string post ids that were sent in this batch.
        allowed_themes: Set of valid theme names.

    Returns:
        A list of ``{"post_id", "primary_theme", "label_phrases"}`` dicts,
        at most one per post id in ``batch_ids``. Themes outside
        ``allowed_themes`` are replaced by ``"Other/Unclear"``.
    """
    if isinstance(raw_items, dict):
        raw_items = [raw_items]
    if not isinstance(raw_items, list):
        return []

    labels, taken = [], set()
    for item in raw_items:
        if not isinstance(item, dict):
            continue
        pid = item.get("post_id")
        if pid is None:
            continue
        # The model may echo ids as numbers; the merge key is a string.
        pid = str(pid).strip()
        if pid not in batch_ids or pid in taken:
            # Unknown or repeated ids would add or duplicate rows on merge.
            continue

        theme = item.get("primary_theme")
        if isinstance(theme, str):
            theme = theme.strip()
        if not isinstance(theme, str) or theme not in allowed_themes:
            theme = "Other/Unclear"

        phrases = item.get("label_phrases", [])
        if isinstance(phrases, str):
            phrases = [phrases]
        elif not isinstance(phrases, list):
            phrases = []

        taken.add(pid)
        labels.append({
            "post_id": pid,
            "primary_theme": theme,
            "label_phrases": phrases,
        })
    return labels


def _full_output_path(save_path):
    """Return ``save_path`` with ``_full`` inserted before its extension."""
    import os

    root, ext = os.path.splitext(os.fspath(save_path))
    return f"{root}_full{ext}"


def run_thematic_analysis_with_checkpoint(
    csv_path,
    save_path,
    text_column="selftext",
    dev_n=50,
    apply_batch_size=5,
    min_words=10,
    max_words=600,
    model="gpt-5-mini",
):
    """Build a themebook from a sample of posts and label every post with it.

    Pipeline:
      1. Load ``csv_path`` and drop rows whose ``text_column`` is missing.
      2. Phases 1–5: sample up to ``dev_n`` posts, split them into segments,
         let crewAI agents write memos and codes, then derive themes.
      3. Phase 6: apply the frozen themebook to every post that passes the
         word-count filter, in batches of ``apply_batch_size``.
      4. Merge the labels back onto the loaded rows and write the result.

    Args:
        csv_path: Input CSV; must contain ``text_column``.
        save_path: Checkpoint CSV. Labels are appended here after every
            batch. The merged output goes to the same path with ``_full``
            inserted before the extension.
        text_column: Name of the column holding the post text.
        dev_n: Maximum number of posts sampled for theme construction.
        apply_batch_size: Number of posts sent per Phase 6 request (>= 1).
        min_words: Posts with fewer words are not labelled.
        max_words: Longer posts are truncated to this many words.
        model: LLM identifier passed to every crewAI agent.

    Returns:
        None. Results are written to disk.

    Raises:
        ValueError: If ``text_column`` is missing, ``apply_batch_size`` is
            below 1, there are no posts with text, or no usable theme could
            be parsed from the model output.

    Note:
        The checkpoint is write-only: a re-run rebuilds the themebook and
        starts labelling from scratch, overwriting ``save_path``.
    """
    import json
    import pandas as pd
    from crewai import Agent, Task, Crew, Process

    # Fail before any LLM call is made rather than after theme construction.
    if apply_batch_size < 1:
        raise ValueError("apply_batch_size must be at least 1")

    # =====================================================
    # Load data
    # =====================================================

    df = pd.read_csv(csv_path)
    if text_column not in df.columns:
        raise ValueError(f"CSV 必须包含列: {text_column}")

    # .copy() so the column assignments below write to an independent frame
    # instead of a filtered view of the original.
    df = df[df[text_column].notna()].copy()
    df[text_column] = df[text_column].astype(str)

    df["post_id"] = df.index.astype(str)
    df["posts"] = df[text_column]

    print(f"📥 Loaded {len(df)} raw posts")

    if df.empty:
        raise ValueError(f"No rows with text in column: {text_column}")

    # =====================================================
    # Phase 1–5: exploratory multi-agent theme construction
    # =====================================================

    dev_df = df.sample(min(dev_n, len(df)), random_state=42)

    dev_segments = []
    for post_id, post_text in zip(dev_df["post_id"], dev_df["posts"]):
        for i, part in enumerate(_split_long_post(post_text)):
            dev_segments.append({
                "segment_id": f"{post_id}_seg{i}",
                "post_id": post_id,
                "text": part
            })

    familiarizer = Agent(
        role="Familiarization Agent",
        goal="Read text segments and write analytic memos.",
        backstory="You are a qualitative researcher immersing yourself in raw text.",
        llm=model,
        allow_delegation=False,
    )

    coder = Agent(
        role="Coding Agent",
        goal="Generate semantic codes.",
        backstory="You perform open and reflexive qualitative coding.",
        llm=model,
        allow_delegation=False,
    )

    theme_builder = Agent(
        role="Theme Builder",
        goal="Develop and define themes.",
        backstory="You synthesize codes into coherent thematic structures.",
        llm=model,
        allow_delegation=False,
    )

    t1 = Task(
        description=f"""
PHASE 1 – Familiarization
Write analytic memos and patterns.

Segments:
{json.dumps(dev_segments, ensure_ascii=False)}
""",
        expected_output="JSON memo.",
        agent=familiarizer,
    )

    t2 = Task(
        description=f"""
PHASE 2 – Coding
Generate 1–3 semantic codes per segment.

Return JSON:
[
  {{ "segment_id": "...", "post_id": "...", "codes": ["..."] }}
]

Segments:
{json.dumps(dev_segments, ensure_ascii=False)}
""",
        expected_output="JSON codes.",
        agent=coder,
    )

    Crew(
        agents=[familiarizer, coder],
        tasks=[t1, t2],
        process=Process.sequential,
    ).kickoff()

    segment_codes = _parse_json_payload(t2.output.raw)
    post_codes = _aggregate_codes_to_post(segment_codes)
    if not post_codes:
        print("WARNING: no codes could be parsed from the coding agent output")

    t3 = Task(
        description=f"""
PHASE 3–5 – Theme construction
Develop final themes from codes.

Codes:
{json.dumps(post_codes, ensure_ascii=False)}

Return JSON:
{{ "themes": [{{"theme_name": "...", "definition": "..."}}] }}
""",
        expected_output="JSON themebook.",
        agent=theme_builder,
    )

    Crew(
        agents=[theme_builder],
        tasks=[t3],
        process=Process.sequential,
    ).kickoff()

    # default={} keeps the "nothing parsed" case a mapping, and
    # _normalize_themes also tolerates a bare list of themes.
    themebook = _parse_json_payload(t3.output.raw, default={})
    theme_summary = _normalize_themes(themebook)
    if not theme_summary:
        # Without themes every post would be labelled "Other/Unclear" at
        # full API cost, so stop here.
        raise ValueError("No usable themes could be parsed from the theme builder output")
    allowed_themes = {t["theme_name"] for t in theme_summary}

    print(f"✔ Themebook ready: {len(theme_summary)} themes")

    # =====================================================
    # Phase 6: frozen theme application (execution agent)
    # =====================================================

    filtered_posts = []
    for post_id, post_text in zip(df["post_id"], df["posts"]):
        kept = _filter_by_word_count(post_text, min_words, max_words)
        if kept is not None:
            filtered_posts.append({
                "post_id": post_id,
                "text": kept
            })

    print(f"🚀 Posts after word filter: {len(filtered_posts)}")

    label_columns = ["post_id", "primary_theme", "label_phrases"]
    results = []

    # Start the checkpoint with a header-only file so stale labels from an
    # earlier run (built against a different themebook) are discarded.
    pd.DataFrame(columns=label_columns).to_csv(save_path, index=False)

    for i in range(0, len(filtered_posts), apply_batch_size):
        batch = filtered_posts[i:i + apply_batch_size]

        # A new agent is built for every batch.
        applier = Agent(
            role="Theme Application Agent",
            goal="Apply fixed themes without reinterpretation.",
            backstory="Themes are final; you only assign and extract evidence.",
            llm=model,
            allow_delegation=False,
        )

        t_apply = Task(
            description=f"""
PHASE 6 – THEME APPLICATION (FROZEN)

Rules:
- Use ONLY the provided themebook
- Assign EXACTLY ONE primary theme per post
- Extract 1–3 short evidence phrases
- Do NOT revise themes
- Output STRICT JSON only

Return JSON:
[
  {{
    "post_id": "...",
    "primary_theme": "...",
    "label_phrases": ["...", "..."]
  }}
]

Themebook:
{json.dumps(theme_summary, ensure_ascii=False)}

Posts:
{json.dumps(batch, ensure_ascii=False)}
""",
            expected_output="Strict JSON.",
            agent=applier,
        )

        Crew(
            agents=[applier],
            tasks=[t_apply],
            process=Process.sequential,
            verbose=False,
        ).kickoff()

        raw = _parse_json_payload(t_apply.output.raw, default=[])
        batch_labels = _collect_batch_labels(
            raw,
            {post["post_id"] for post in batch},
            allowed_themes,
        )
        results.extend(batch_labels)

        # Append this batch to the checkpoint so partial progress survives a
        # failure in a later batch.
        if batch_labels:
            pd.DataFrame(batch_labels, columns=label_columns).to_csv(
                save_path, mode="a", header=False, index=False
            )

        print(f"✔ Saved batch {i // apply_batch_size + 1}")

    # =====================================================
    # Merge back to original df
    # =====================================================

    # Explicit columns keep the merge key present even when no label was
    # collected at all.
    labels_df = pd.DataFrame(results, columns=label_columns)
    labels_df["post_id"] = labels_df["post_id"].astype(str)
    final_df = df.merge(labels_df, on="post_id", how="left")

    final_save_path = _full_output_path(save_path)
    final_df.to_csv(final_save_path, index=False)

    print(f"📦 Final merged file saved to: {final_save_path}")
    print("✅ Pipeline completed successfully.")


In [None]:
# Run the pipeline on the merged subreddit dataset: themes are built from a
# sample of up to 100 posts (dev_n) and then applied in batches of 20 posts.
# min_words / max_words are left at the function defaults (10 and 600).
# The merged output is written next to save_path with a "_full" suffix
# before the ".csv" extension.
run_thematic_analysis_with_checkpoint(
    csv_path="/AI addiction/merged_subreddit_dataset_clean.csv",
    save_path="/AI addiction/ALLthematic_clean.csv",
    text_column="selftext",
    dev_n=100,
    apply_batch_size=20,
)
