In [2]:
!pip install Empath

Collecting Empath
  Downloading empath-0.89.tar.gz (57 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 57.6/57.6 kB 3.3 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: Empath
  Building wheel for Empath (setup.py) ... done
  Created wheel for Empath: filename=empath-0.89-py3-none-any.whl size=57799 sha256=3bec16ba00ccbbca4df87a28e6eaf18b20cdeb93195d0085f3acc2bbb239e169
  Stored in directory: /root/.cache/pip/wheels/b5/e9/20/019c1afcddedf93646169dcd5a99a28ca74ee6cf6ad23ccbdf
Successfully built Empath
Installing collected packages: Empath
Successfully installed Empath-0.89


In [3]:
from pathlib import Path
import pandas as pd
from tqdm.auto import tqdm
from empath import Empath

# Only analyze the first N posts of the input file
MAX_ROWS = 50

# Paths (adjust if your file is elsewhere)
INPUT_PATH = Path("beyondb_post_predictions.csv")
# Derived from MAX_ROWS so the file name always matches the number of rows analyzed
OUTPUT_PATH = Path(f"beyondb_first{MAX_ROWS}_empath_reasons.csv")

# Column with the post text to analyze
TEXT_COL = "cleaned_post_content"

# Optional: column with top emotion (if you have one)
EMOTION_COL = "pred_emotions"   # set to None if you don't want to use it

# Create the Empath lexicon once; infer_reason_labels() reuses this shared instance for every post.
lexicon = Empath()


In [4]:
# High-level "reason" labels, each backed by the Empath categories whose scores
# are summed to produce that label's score. Insertion order matters: it is the
# tie-break order when labels are ranked.
# NOTE(review): a category name Empath does not know contributes 0 through
# scores.get(cat, 0.0) rather than raising -- "job" and "conflict" look like
# they may not be built-in Empath categories; confirm against the lexicon's
# category list.
REASON_CATEGORY_GROUPS = {
    "relationship_issues": [
        "family", "friends", "children", "affection",
    ],
    "social_isolation": [
        "friends", "neglect", "shame", "envy",
    ],
    "work_or_study_stress": [
        "work", "office", "job", "school",
    ],
    "financial_stress": [
        "money",
    ],
    "health_problems": [
        "health", "medical_emergency",
    ],
    "home_or_living_situation": [
        "home",
    ],
    "conflict_or_abuse": [
        "conflict", "dispute", "violence", "crime",
    ],
    "grief_or_loss": [
        "death",
    ],
    "general_negative_emotion": [
        "sadness", "anger", "fear",
    ],
    "online_or_technology_issues": [
        "internet",
    ],
}

# Every distinct Empath category referenced by any group
ALL_REASON_CATS = set().union(*REASON_CATEGORY_GROUPS.values())


In [5]:
def infer_reason_labels(text: str, top_n: int = 3):
    """
    Rank the high-level reason labels for one post using Empath scores.

    Each label's score is the sum of the normalized Empath scores of the
    categories listed for it in REASON_CATEGORY_GROUPS. Labels whose score is
    not positive are discarded; the rest are returned best-first, limited to
    ``top_n`` entries. Non-string or blank input yields an empty list.
    """
    is_usable_text = isinstance(text, str) and bool(text.strip())
    if not is_usable_text:
        return []

    category_scores = lexicon.analyze(text, normalize=True)

    # Sum member-category scores per label, keeping only labels that matched
    positive_totals = {}
    for reason, members in REASON_CATEGORY_GROUPS.items():
        total = sum(category_scores.get(name, 0.0) for name in members)
        if total > 0:
            positive_totals[reason] = total

    if not positive_totals:
        return []

    # Highest score first; ties keep REASON_CATEGORY_GROUPS order (stable sort)
    ranked = sorted(positive_totals, key=positive_totals.get, reverse=True)
    return ranked[:top_n]


def build_reason_explanation(text: str, reasons: list[str], emotion: str | None = None) -> str:
    """
    Simple human-readable explanation of the inferred reasons.
    """
    if not reasons:
        if emotion:
            return (
                f"The post expresses the emotion '{emotion}', but Empath did not "
                "strongly match any specific topical reason categories."
            )
        else:
            return (
                "Empath did not strongly match any specific topical reason "
                "categories for this post."
            )

    reason_str = ", ".join(reasons).replace("_", " ")
    if emotion:
        return (
            f"The post is associated with the emotion '{emotion}'. "
            f"Empath found language related to: {reason_str}. "
            "These themes may represent possible reasons behind how the person feels."
        )
    else:
        return (
            f"Empath found language related to: {reason_str}. "
            "These themes may represent possible reasons behind the person's emotional state."
        )


In [6]:
def run_empath_reason_extraction():
    """
    Label the first MAX_ROWS posts of INPUT_PATH with Empath-derived reasons.

    Reads the CSV, infers up to three reason labels per post from TEXT_COL,
    builds a readable explanation (mentioning the value of EMOTION_COL when
    that column is configured and present), and writes the rows plus two new
    columns -- ``empath_reason_labels`` and ``empath_reason_explanation`` --
    to OUTPUT_PATH.

    Raises:
        ValueError: if TEXT_COL is not a column of the input CSV.
    """
    print(f"Reading input CSV from: {INPUT_PATH}")
    # Only the first MAX_ROWS rows are used, so don't parse the rest of the file.
    df = pd.read_csv(INPUT_PATH, nrows=MAX_ROWS)

    if TEXT_COL not in df.columns:
        raise ValueError(f"Column '{TEXT_COL}' not found in {INPUT_PATH}")

    print(f"Processing first {len(df)} posts.")

    # Missing text is treated as an empty post (and is written back as such).
    df[TEXT_COL] = df[TEXT_COL].fillna("")

    has_emotion = EMOTION_COL is not None and EMOTION_COL in df.columns
    if has_emotion:
        df[EMOTION_COL] = df[EMOTION_COL].fillna("")
        emotion_values = df[EMOTION_COL]
    else:
        emotion_values = [None] * len(df)

    reason_labels_list = []
    reason_explanations = []

    # Iterate just the two needed columns; iterrows() would build a full
    # Series for every row.
    for raw_text, raw_emotion in tqdm(zip(df[TEXT_COL], emotion_values), total=len(df)):
        text = str(raw_text)
        emotion = str(raw_emotion) if has_emotion else None

        reasons = infer_reason_labels(text, top_n=3)
        explanation = build_reason_explanation(text, reasons, emotion)

        reason_labels_list.append(reasons)
        reason_explanations.append(explanation)

    df["empath_reason_labels"] = reason_labels_list
    df["empath_reason_explanation"] = reason_explanations

    # Make sure the destination folder exists before writing.
    OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(OUTPUT_PATH, index=False)
    print(f"Saved output with reasons to: {OUTPUT_PATH}")


# Execute the pipeline; it prints the input path, the number of posts processed, and the output path.
run_empath_reason_extraction()


Reading input CSV from: beyondb_post_predictions.csv
Processing first 50 posts.


  0%|          | 0/50 [00:00<?, ?it/s]

Saved output with reasons to: beyondb_first50_empath_reasons.csv
