In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the dataset directory and print the full path of every file
# found, so the paths used by later cells can be checked against what is mounted.
for dirname, _, filenames in os.walk('/kaggle/input/mental-health'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import pandas as pd
import numpy as np
import re
from typing import List, Dict, Any

# ====================================================================
# STEP 1: DATA LOADING, CONSOLIDATION, AND ROBUST CLEANING
# ====================================================================

# 1.1 Define the list of file paths (ensure these paths are correct in your Kaggle environment)
# One CSV per subreddit/disorder; the file stem is reused below as the 'source' label.
file_paths = [
    '/kaggle/input/mental-health/ptsd.csv',
    '/kaggle/input/mental-health/adhd.csv',
    '/kaggle/input/mental-health/aspergers.csv',
    '/kaggle/input/mental-health/ocd.csv',
    '/kaggle/input/mental-health/depression.csv'
]

TEXT_COL = 'body' # The column containing the main message

# Accumulates one DataFrame per successfully loaded file; concatenated after the loop.
all_data: List[pd.DataFrame] = []
print("--- Starting Data Loading and Consolidation ---")

for path in file_paths:
    try:
        # Load the CSV
        df = pd.read_csv(path)
        # Add a source column: file name without directory and without '.csv'
        df['source'] = path.split('/')[-1].replace('.csv', '')

        # A file without the expected text column is skipped (with a warning),
        # not treated as a fatal error.
        if TEXT_COL not in df.columns:
            print(f"Warning: '{TEXT_COL}' not found in {path}. Skipping this file.")
            continue

        # Standardize the text column name so every later step reads 'text_raw'
        df.rename(columns={TEXT_COL: 'text_raw'}, inplace=True)
        all_data.append(df)
        print(f"Loaded {path} with {len(df)} initial rows.")

    except FileNotFoundError:
        print(f"File not found: {path}")
    except Exception as e:
        # Best-effort loading: any other read/parse problem is reported and the
        # loop continues with the remaining files.
        print(f"Error loading {path}: {e}")

# Fail loudly if nothing could be loaded; every later step needs at least one file.
if not all_data:
    raise ValueError("No data files were loaded successfully. Please check your Kaggle file paths.")

# ignore_index=True gives the combined frame a fresh 0..N-1 index.
df_combined = pd.concat(all_data, ignore_index=True)
print(f"\nTotal rows after consolidation: {len(df_combined)}")

# 1.2 Robust Cleaning and Null/Empty Body Removal (Crucial step based on your instruction)
initial_rows = len(df_combined)

# Drop missing bodies FIRST. Casting to str before dropping would turn NaN/None
# into the literal strings 'nan'/'None', which dropna() can no longer detect
# (this is why the step previously reported 0 removed rows).
# .copy() makes the result an independent frame so the column assignments that
# follow do not trigger pandas chained-assignment warnings.
df_combined = df_combined.dropna(subset=['text_raw']).copy()

# Every remaining value is a real body; normalise its type to str so the
# string operations below are safe even if a file was parsed as numeric.
df_combined['text_raw'] = df_combined['text_raw'].astype(str)

# Remove rows whose body is empty or whitespace-only (strip() -> '' -> False).
df_combined = df_combined[df_combined['text_raw'].str.strip().astype(bool)].copy()

final_rows = len(df_combined)
print(f"Rows removed due to null/empty body: {initial_rows - final_rows}")
print(f"Final rows for analysis: {final_rows}")


# 1.3 & 1.4 General Preprocessing (Cleaning and Lowercasing)
def clean_text(text: str) -> str:
    """Lowercase *text* and reduce it to space-separated a-z words.

    Steps, in order: lowercase, delete URLs, delete @handles, delete every
    character that is neither a lowercase ASCII letter nor whitespace, then
    collapse whitespace runs to single spaces and trim the ends.
    """
    # (pattern, flags) pairs whose matches are deleted outright, applied in order.
    deletions = (
        (r'http\S+|www\S+|https\S+', re.MULTILINE),  # URLs
        (r'@\w+', 0),                                # mentions/user handles
        (r'[^a-z\s]', 0),                            # punctuation, digits, non-ASCII
    )
    cleaned = text.lower()
    for pattern, flags in deletions:
        cleaned = re.sub(pattern, '', cleaned, flags=flags)
    # Collapse whitespace last, since the deletions above can leave gaps behind.
    return re.sub(r'\s+', ' ', cleaned).strip()

# Derive the cleaned text and its whitespace-split token list for every row.
# A body made up only of characters that clean_text removes yields '' and [].
df_combined['text_cleaned'] = df_combined['text_raw'].apply(clean_text)
df_combined['tokens'] = df_combined['text_cleaned'].apply(lambda x: x.split())

# 1.5 Placeholder for Target Variable (Sentiment) - You need to define this!
# Since your original data only has disorder labels, you must create a sentiment label.
# For simplicity, let's assume ALL posts are 'Negative' (1) and we will only look 
# for 'Positive' (0) if there are explicitly supportive/recovery-focused posts later.
# For now, we will use a binary classification.

# NOTE(review): every row gets the same constant label here, so this column
# carries no information until a real labelling strategy replaces it.
df_combined['sentiment_label'] = 1 # Default to Negative (1)
# NOTE: If your data contains explicit labels (e.g., 'Positive', 'Neutral', 'Negative') use that column.
# If you need to include 'Neutral' (2), you must have a way to identify it. 


print("\n--- Sample of Cleaned Data ---")
print(df_combined[['source', 'text_raw', 'text_cleaned', 'sentiment_label']].head())

# Save the resulting DataFrame for the next step (ECR Module).
# The 'tokens' lists are written as their string repr, so the next step has to
# parse them back into lists after reading the CSV.
df_combined.to_csv('cleaned_mental_health_data.csv', index=False)
print("\nCleaned data saved to 'cleaned_mental_health_data.csv'.")

--- Starting Data Loading and Consolidation ---
Loaded /kaggle/input/mental-health/ptsd.csv with 24028 initial rows.
Loaded /kaggle/input/mental-health/adhd.csv with 37109 initial rows.
Loaded /kaggle/input/mental-health/aspergers.csv with 23294 initial rows.
Loaded /kaggle/input/mental-health/ocd.csv with 42826 initial rows.
Loaded /kaggle/input/mental-health/depression.csv with 24031 initial rows.

Total rows after consolidation: 151288
Rows removed due to null/empty body: 0
Final rows for analysis: 151288

--- Sample of Cleaned Data ---
  source                                           text_raw  \
0   ptsd  This year felt like literal hell. It’s over no...   
1   ptsd  Can feel my skin tightening up as I type this ...   
2   ptsd  I shout at my animals sometimes when they do s...   
3   ptsd  I'm really struggling with my past and it's pr...   
4   ptsd                                   On Snapchat call   

                                        text_cleaned  sentiment_label  
0  

In [3]:
import pandas as pd
import numpy as np
from typing import List, Dict, Any, Tuple

# ====================================================================
# STEP 2.1: LOAD CLEANED DATA FROM STEP 1
# ====================================================================

import ast  # imported here so this cell stays self-contained


def _parse_list_cell(cell):
    """Turn a CSV cell holding the repr of a list back into a list.

    ast.literal_eval only accepts Python literals, so unlike eval() it cannot
    execute arbitrary code that might be present in the file. Non-string cells
    (e.g. NaN from an empty field) become an empty list.
    """
    return ast.literal_eval(cell) if isinstance(cell, str) else []


try:
    df_combined = pd.read_csv('cleaned_mental_health_data.csv')
    # Convert tokens back from string (how CSV saves lists) to actual lists
    df_combined['tokens'] = df_combined['tokens'].apply(_parse_list_cell)
    print(f"Loaded data for ECR from 'cleaned_mental_health_data.csv' with {len(df_combined)} rows.")
except FileNotFoundError:
    print("Error: 'cleaned_mental_health_data.csv' not found. Please ensure Step 1 was run successfully.")
    # Re-raise: nothing below can run without the Step 1 output.
    raise

# ====================================================================
# STEP 2.2: DEFINE ECR COMPONENTS (EDD AND FUNCTIONS)
# ====================================================================

# 2.2.1: Define the Emotion Dimension Dictionary (EDD) with Intensity Scores
# Scores are numerical intensity values, used for ES calculation.
# Emotion Dimension Dictionary: dimension name -> {word: signed intensity}.
# Positive intensities count toward the positive sum and negative ones toward
# the negative sum when the Emotion Score is computed.
EDD = {
    'Desirable': {
        'hope': 0.8,
        'recovery': 0.9,
        'better': 0.7,
        'progress': 0.7,
        'support': 0.8,
        'relief': 0.6,
        'calm': 0.6,
        'managed': 0.7,
        'accepting': 0.5,
        'proud': 0.9,
    },
    'Undesirable': {
        'anxiety': -0.9,
        'fear': -0.9,
        'hopeless': -1.0,
        'struggle': -0.7,
        'overwhelmed': -0.8,
        'guilt': -0.7,
        'pining': -0.6,
        'depressed': -1.0,
        'suicidal': -1.0,
        'panic': -0.9,
        'trauma': -0.8,
    },
    'Praiseworthy': {
        'selfcompassion': 0.9,
        'advocating': 0.8,
        'acceptinghelp': 0.9,
        'kind': 0.7,
        'brave': 0.8,
        'trying': 0.6,
    },
    'Blameworthy': {
        'selfhate': -1.0,
        'selfblame': -0.9,
        'avoidance': -0.8,
        'stigma': -0.9,
        'fail': -0.8,
        'wrong': -0.7,
    },
    'Confirmed': {
        'validated': 0.7,
        'working': 0.6,
        'confirmed': 0.5,
        'true': 0.5,
        'diagnosis': 0.4,
    },
    'Disconfirmed': {
        'misdiagnosed': -0.7,
        'notworking': -0.8,
        'relapse': -0.9,
        'failed': -0.8,
        'misunderstood': -0.7,
    },
}

# 2.2.2: ECR Functions (Simplified OCC Rules)
def perform_ecr(text_tokens: List[str], EDD: Dict[str, Dict[str, float]]) -> Tuple[List[Any], List[Any]]:
    """Applies the simplified 10 OCC rules to infer emotion-cognitive knowledge (ECK)."""
    
    # Map tokens to their dimensions and intensity
    dim_map = {}
    for token in text_tokens:
        for dim, words in EDD.items():
            if token in words:
                dim_map[token] = {'dim': dim, 'intensity': words[token]}
                break

    PECK = []
    NECK = []
    
    def find_word_in_dim(dim_name: str, tokens_to_check: List[str]) -> str | None:
        """Helper to find the first token of a given dimension not yet used."""
        return next((token for token in tokens_to_check if token in dim_map and dim_map[token]['dim'] == dim_name), None)

    unique_tokens = list(dim_map.keys())
    tokens_used = set()

    # --- Compound Rules (Rules 5-10) - Higher Priority ---
    
    # Compound checks require multiple tokens; simplify by iterating over all unique tokens for potential pairings.
    
    # R6: Undesirable + Blameworthy -> Anger/Reproach (NECK)
    u_token = find_word_in_dim('Undesirable', unique_tokens)
    b_token = find_word_in_dim('Blameworthy', unique_tokens)
    if u_token and b_token and u_token not in tokens_used and b_token not in tokens_used:
        NECK.append([u_token, b_token, 'Anger/Self-Reproach'])
        tokens_used.add(u_token)
        tokens_used.add(b_token)

    # R5: Desirable + Praiseworthy -> Gratitude/Pride (PECK)
    d_token = find_word_in_dim('Desirable', unique_tokens)
    p_token = find_word_in_dim('Praiseworthy', unique_tokens)
    if d_token and p_token and d_token not in tokens_used and p_token not in tokens_used:
        PECK.append([d_token, p_token, 'Gratitude/Pride'])
        tokens_used.add(d_token)
        tokens_used.add(p_token)

    # R7: Desirable + Confirmed -> Satisfaction (PECK)
    c_token = find_word_in_dim('Confirmed', unique_tokens)
    if d_token and c_token and d_token not in tokens_used and c_token not in tokens_used:
        PECK.append([d_token, c_token, 'Satisfaction'])
        tokens_used.add(d_token)
        tokens_used.add(c_token)
            
    # R8: Undesirable + Confirmed -> Fear-Confirmed (NECK)
    if u_token and c_token and u_token not in tokens_used and c_token not in tokens_used:
        NECK.append([u_token, c_token, 'Fear-Confirmed'])
        tokens_used.add(u_token)
        tokens_used.add(c_token)

    # R9: Desirable + Disconfirmed -> Relief (PECK)
    dc_token = find_word_in_dim('Disconfirmed', unique_tokens)
    if d_token and dc_token and d_token not in tokens_used and dc_token not in tokens_used:
        PECK.append([d_token, dc_token, 'Relief'])
        tokens_used.add(d_token)
        tokens_used.add(dc_token)

    # R10: Undesirable + Disconfirmed -> Disappointment (NECK)
    if u_token and dc_token and u_token not in tokens_used and dc_token not in tokens_used:
        NECK.append([u_token, dc_token, 'Disappointment'])
        tokens_used.add(u_token)
        tokens_used.add(dc_token)
        
    # --- Single Rules (Rules 1-4) - Fallback for unused tokens ---
    for token in unique_tokens:
        if token not in tokens_used:
            dim = dim_map[token]['dim']
            
            # R1: Desirable -> Joy
            if dim == 'Desirable':
                PECK.append([token, 'Joy'])
            # R2: Undesirable -> Distress
            elif dim == 'Undesirable':
                NECK.append([token, 'Distress'])
            # R3: Praiseworthy -> Admiration
            elif dim == 'Praiseworthy':
                PECK.append([token, 'Admiration'])
            # R4: Blameworthy -> Reproach
            elif dim == 'Blameworthy':
                NECK.append([token, 'Reproach'])
                
    return PECK, NECK


# 2.2.3: Calculate ES and CS_ECR
def calculate_es_cs(text_tokens: List[str], EDD: Dict[str, Dict[str, float]]) -> Tuple[float, float]:
    """Return ``(ES, CS_ECR)`` for one token list.

    Every token occurrence found in *EDD* contributes the intensity from the
    first dimension that lists it: positive values are added to the positive
    sum, and the absolute value of negative ones to the negative sum (Eq. 12).
    ES is ``(pos - neg) / (pos + neg)`` (Eq. 13), or 0.0 when nothing matched.
    CS_ECR is ``abs(ES)`` (Eq. 14).
    """
    positive_total = 0.0
    negative_total = 0.0

    for token in text_tokens:
        # First dimension containing the token wins; None means "not in EDD".
        weight = next(
            (words[token] for words in EDD.values() if token in words),
            None,
        )
        if weight is None:
            continue
        if weight > 0:
            positive_total += weight
        elif weight < 0:
            negative_total += abs(weight)

    if (positive_total + negative_total) > 0:
        ES = (positive_total - negative_total) / (positive_total + negative_total)
    else:
        ES = 0.0

    return ES, abs(ES)

# ====================================================================
# STEP 2.3: APPLY ECR TO THE DATA
# ====================================================================

print("\n--- Applying ECR to data (calculating ES, CS_ECR, PECK, NECK) ---")

# Apply ES/CS calculation.
# Wrapping the returned 2-tuple in pd.Series makes .apply() produce a two-column
# frame, which is assigned positionally to the two new columns.
df_combined[['ES', 'CS_ECR']] = df_combined['tokens'].apply(
    lambda x: pd.Series(calculate_es_cs(x, EDD))
)

# Apply ECR rule-based knowledge extraction (same tuple -> two-column expansion;
# each cell ends up holding a list of knowledge entries).
df_combined[['PECK', 'NECK']] = df_combined['tokens'].apply(
    lambda x: pd.Series(perform_ecr(x, EDD))
)

# ====================================================================
# STEP 2.4: SAVE PROCESSED DATA FOR STEP 3
# ====================================================================

# Only these columns are carried forward; any other column from the source
# files is dropped from the saved CSV.
output_cols = ['source', 'text_raw', 'text_cleaned', 'tokens', 'sentiment_label', 'ES', 'CS_ECR', 'PECK', 'NECK']
print("\n--- Sample of ECR Processed Data ---")
print(df_combined[output_cols].head())

# List-valued columns (tokens, PECK, NECK) are written as their string repr.
df_combined[output_cols].to_csv('ecr_processed_mental_health_data.csv', index=False)
print("\nProcessed data saved to 'ecr_processed_mental_health_data.csv'.")
print("Proceed to Step 3: Self-Adaptive Fusion (SAFA) and BERT Integration.")

Loaded data for ECR from 'cleaned_mental_health_data.csv' with 151288 rows.

--- Applying ECR to data (calculating ES, CS_ECR, PECK, NECK) ---

--- Sample of ECR Processed Data ---
  source                                           text_raw  \
0   ptsd  This year felt like literal hell. It’s over no...   
1   ptsd  Can feel my skin tightening up as I type this ...   
2   ptsd  I shout at my animals sometimes when they do s...   
3   ptsd  I'm really struggling with my past and it's pr...   
4   ptsd                                   On Snapchat call   

                                        text_cleaned  \
0  this year felt like literal hell its over now ...   
1  can feel my skin tightening up as i type this ...   
2  i shout at my animals sometimes when they do s...   
3  im really struggling with my past and its prob...   
4                                   on snapchat call   

                                              tokens  sentiment_label   ES  \
0  [this, year, felt, lik

In [4]:
import pandas as pd
import numpy as np
import re
from typing import List, Dict, Any, Tuple
from sklearn.preprocessing import MinMaxScaler

# ====================================================================
# STEP 3.1: LOAD ECR PROCESSED DATA FROM STEP 2
# ====================================================================

import ast  # imported here so this cell stays self-contained


def _parse_list_cell(cell):
    """Turn a CSV cell holding the repr of a (nested) list back into a list.

    ast.literal_eval only accepts Python literals, so unlike eval() it cannot
    execute arbitrary code that might be present in the file. Non-string cells
    (e.g. NaN from an empty field) become an empty list.
    """
    return ast.literal_eval(cell) if isinstance(cell, str) else []


try:
    df_combined = pd.read_csv('ecr_processed_mental_health_data.csv')
    # Convert 'tokens', 'PECK', 'NECK' back from string to actual lists/objects
    for list_col in ('tokens', 'PECK', 'NECK'):
        df_combined[list_col] = df_combined[list_col].apply(_parse_list_cell)
    print(f"Loaded ECR processed data for SAFA from 'ecr_processed_mental_health_data.csv' with {len(df_combined)} rows.")
except FileNotFoundError:
    print("Error: 'ecr_processed_mental_health_data.csv' not found. Please ensure Step 2 was run successfully.")
    # Re-raise: nothing below can run without the Step 2 output.
    raise

# ====================================================================
# STEP 3.2: SIMULATE BERT SENTIMENT PREDICTION (Placeholder for actual BERT)
# ====================================================================

# In a real scenario, you would train a BERT model on 'text_cleaned'
# to get sentiment_label_bert and CS_BERT.
# For now, we'll simulate it based on your existing 'sentiment_label'
# and generate a random confidence score.

def simulate_bert_prediction(sentiment_label: int, es_score: float) -> Tuple[int, float]:
    """Stand-in for a trained BERT classifier: return ``(label, confidence)``.

    The label follows the ECR emotion score when it is decisive
    (``es_score > 0.4`` -> 0/Positive, ``es_score < -0.4`` -> 1/Negative) and
    is otherwise drawn at random from {0: Pos, 1: Neg, 2: Neu}. The confidence
    is drawn uniformly from [0.6, 0.9) and, when ``abs(es_score) > 0.5``,
    boosted by a further uniform draw from [0.05, 0.1), capped at 0.99.

    ``sentiment_label`` is accepted for interface compatibility but is not
    used by the simulation. Results depend on NumPy's global random state.
    """
    # Predicted label (ŷ): deterministic only for a strong ECR score.
    if es_score > 0.4:
        predicted_label = 0
    elif es_score < -0.4:
        predicted_label = 1
    else:
        predicted_label = np.random.choice([0, 1, 2])

    # Confidence (CS_BERT): random base, nudged upward for a strong ECR score.
    confidence = np.random.uniform(0.6, 0.9)
    if abs(es_score) > 0.5:
        confidence = min(0.99, confidence + np.random.uniform(0.05, 0.1))

    return predicted_label, confidence

print("\n--- Simulating BERT Sentiment Predictions ---")
# Row-wise apply: each row yields a (label, confidence) tuple, expanded into the
# two new columns via pd.Series. The draws are random, so the values differ
# between runs unless NumPy's random seed is fixed beforehand.
df_combined[['sentiment_label_bert', 'CS_BERT']] = df_combined.apply(
    lambda row: pd.Series(simulate_bert_prediction(row['sentiment_label'], row['ES'])),
    axis=1
)

# 3.2.1: Normalize CS_BERT (Eq. 19)
# Use MinMaxScaler to scale CS_BERT to the range [0, 1].
# The scaling is relative to the min/max of CS_BERT in this DataFrame, so the
# normalized values are only comparable within this run.
scaler = MinMaxScaler(feature_range=(0, 1))
df_combined['CS_BERT_normalized'] = scaler.fit_transform(df_combined[['CS_BERT']])
print("CS_BERT normalized using Min-Max scaling.")

# ====================================================================
# STEP 3.3: SELF-ADAPTIVE FUSION ALGORITHM (SAFA - Algorithm 2)
# ====================================================================

# Algorithm 2: Self-Adaptive Fusion
# Input: OPOE, PECK, NECK, ES, CS_ECR, CS_BERT_normalized, threshold
# Output: SET

def generate_sentence_emotion_tree(original_text: str, pec_knowledge: List[Any], nec_knowledge: List[Any]) -> str:
    """Render a Sentence-Emotion Tree (SET) as a flat string.

    Non-empty knowledge lists are serialised as ``{PECK: item, item, ...}`` /
    ``{NECK: ...}`` blocks, where each item is ``str()`` of one knowledge entry,
    and appended after the original text (PECK first), separated by single
    spaces. With no knowledge at all, the original text is returned unchanged.

    This is a textual simplification: the paper's tree attaches knowledge next
    to the word it refers to, e.g. "All the bastards {[disasters, anger],
    [damned, anger]} who cause disasters will be damned."
    """
    labelled_knowledge = (("PECK", pec_knowledge), ("NECK", nec_knowledge))

    blocks = []
    for label, entries in labelled_knowledge:
        if entries:
            serialised = ', '.join(str(entry) for entry in entries)
            blocks.append(f"{{{label}: {serialised}}}")

    if not blocks:
        # No knowledge to embed
        return original_text

    return f"{original_text} {' '.join(blocks)}"


def self_adaptive_fusion(
    text_raw: str,
    pec_knowledge: List[Any],
    nec_knowledge: List[Any],
    es: float,
    cs_ecr: float,
    sentiment_label_bert: int, # BERT's predicted label (0:Pos, 1:Neg, 2:Neu)
    cs_bert_normalized: float,
    threshold: float = 0.3 # Hyperparameter, can be tuned (e.g., from ablation study)
) -> Tuple[str, List[Any], List[Any], str]:
    """Self-Adaptive Fusion (Algorithm 2): pick which knowledge to embed.

    Decision procedure:
      * ``cs_ecr >= threshold``: ECR decides by the sign of ``es``.
      * otherwise, with ``delta = cs_bert_normalized - cs_ecr``:
          - ``delta >= 0``: BERT decides by its label (0 -> PECK, 1 -> NECK,
            anything else -> nothing);
          - ``delta < 0``: ECR decides by the sign of ``es``.
      * When the deciding signal is neutral (``es == 0`` or a BERT label other
        than 0/1), no knowledge is incorporated.

    Returns:
        ``(SET text, chosen PECK, chosen NECK, reason)``. At most one of the
        two chosen lists is the corresponding input list; the other is an
        empty list.
    """
    # Eq. 20 (modified to use normalized CS_BERT)
    delta = cs_bert_normalized - cs_ecr

    ecr_confident = cs_ecr >= threshold
    bert_decides = (not ecr_confident) and delta >= 0

    if bert_decides:
        # ECR is below threshold and BERT is at least as confident.
        pick_positive = sentiment_label_bert == 0
        pick_negative = (not pick_positive) and sentiment_label_bert == 1
        positive_reason = "BERT-preferred PECK (high CS_BERT, positive BERT)"
        negative_reason = "BERT-preferred NECK (high CS_BERT, negative BERT)"
    else:
        # ECR decides, either because it cleared the threshold or because it
        # out-scored BERT (Algorithm 2, lines 12-15).
        pick_positive = es > 0
        pick_negative = (not pick_positive) and es < 0
        if ecr_confident:
            positive_reason = "ECR-preferred PECK (high CS_ECR, positive ES)"
            negative_reason = "ECR-preferred NECK (high CS_ECR, negative ES)"
        else:
            positive_reason = "ECR-preferred PECK (low delta, positive ES)"
            negative_reason = "ECR-preferred NECK (low delta, negative ES)"

    chosen_pec = []
    chosen_nec = []
    fusion_reason = "No ECK incorporated"
    if pick_positive:
        chosen_pec, fusion_reason = pec_knowledge, positive_reason
    elif pick_negative:
        chosen_nec, fusion_reason = nec_knowledge, negative_reason

    set_text = generate_sentence_emotion_tree(text_raw, chosen_pec, chosen_nec)
    return set_text, chosen_pec, chosen_nec, fusion_reason

print("\n--- Applying Self-Adaptive Fusion Algorithm (SAFA) ---")
# Define the threshold for SAFA (this is a hyperparameter to tune)
safa_threshold = 0.3
print(f"Using SAFA threshold: {safa_threshold}")

# Row-wise apply: self_adaptive_fusion returns a 4-tuple which pd.Series expands
# into the four new columns, in the order listed on the left-hand side.
df_combined[['SET', 'SAFA_PECK', 'SAFA_NECK', 'SAFA_Reason']] = df_combined.apply(
    lambda row: pd.Series(self_adaptive_fusion(
        row['text_raw'],
        row['PECK'],
        row['NECK'],
        row['ES'],
        row['CS_ECR'],
        row['sentiment_label_bert'],
        row['CS_BERT_normalized'],
        safa_threshold
    )),
    axis=1
)

# ====================================================================
# STEP 3.4: SAVE PROCESSED DATA FOR STEP 4 (BERT FEATURE REPRESENTATION)
# ====================================================================

# Columns carried forward to Step 4: Step 2's columns plus the simulated BERT
# outputs and the SAFA results.
output_cols = [
    'source', 'text_raw', 'text_cleaned', 'tokens', 'sentiment_label',
    'ES', 'CS_ECR', 'PECK', 'NECK',
    'sentiment_label_bert', 'CS_BERT', 'CS_BERT_normalized',
    'SET', 'SAFA_PECK', 'SAFA_NECK', 'SAFA_Reason'
]

print("\n--- Sample of SAFA Processed Data ---")
print(df_combined[output_cols].head())

# List-valued columns are written as their string repr and must be parsed back
# into lists by the next step.
df_combined[output_cols].to_csv('safa_processed_mental_health_data.csv', index=False)
print("\nProcessed data with SAFA results saved to 'safa_processed_mental_health_data.csv'.")
print("Proceed to Step 4: Knowledge-Enabled Feature Representation (Soft-Position Embedding & Mask-Transformer).")

Loaded ECR processed data for SAFA from 'ecr_processed_mental_health_data.csv' with 151288 rows.

--- Simulating BERT Sentiment Predictions ---
CS_BERT normalized using Min-Max scaling.

--- Applying Self-Adaptive Fusion Algorithm (SAFA) ---
Using SAFA threshold: 0.3

--- Sample of SAFA Processed Data ---
  source                                           text_raw  \
0   ptsd  This year felt like literal hell. It’s over no...   
1   ptsd  Can feel my skin tightening up as I type this ...   
2   ptsd  I shout at my animals sometimes when they do s...   
3   ptsd  I'm really struggling with my past and it's pr...   
4   ptsd                                   On Snapchat call   

                                        text_cleaned  \
0  this year felt like literal hell its over now ...   
1  can feel my skin tightening up as i type this ...   
2  i shout at my animals sometimes when they do s...   
3  im really struggling with my past and its prob...   
4                                 

In [6]:
import pandas as pd
import numpy as np
import re
from typing import List, Dict, Any, Tuple

# ====================================================================
# STEP 4.1: LOAD SAFA PROCESSED DATA FROM STEP 3
# ====================================================================

import ast  # imported here so this cell stays self-contained


def _parse_list_cell(cell):
    """Turn a CSV cell holding the repr of a (nested) list back into a list.

    ast.literal_eval only accepts Python literals, so unlike eval() it cannot
    execute arbitrary code that might be present in the file. Non-string cells
    (e.g. NaN from an empty field) become an empty list.
    """
    return ast.literal_eval(cell) if isinstance(cell, str) else []


try:
    df_combined = pd.read_csv('safa_processed_mental_health_data.csv')
    # Convert 'tokens', 'PECK', 'NECK', 'SAFA_PECK', 'SAFA_NECK' back from string to actual lists/objects
    for list_col in ('tokens', 'PECK', 'NECK', 'SAFA_PECK', 'SAFA_NECK'):
        df_combined[list_col] = df_combined[list_col].apply(_parse_list_cell)
    print(f"Loaded SAFA processed data for BERT integration from 'safa_processed_mental_health_data.csv' with {len(df_combined)} rows.")
except FileNotFoundError:
    print("Error: 'safa_processed_mental_health_data.csv' not found. Please ensure Step 3 was run successfully.")
    # Re-raise: nothing below can run without the Step 3 output.
    raise

# ====================================================================
# STEP 4.2: TOKENIZATION FOR SENTENCE-EMOTION TREES (SETs)
# ====================================================================

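# The SETs are strings of the form: "original text {PECK: ['token', 'Emotion'], ...} {NECK: ['token', 'Emotion'], ...}"
# (each knowledge entry is the str() of a Python list, so it has single brackets and quoted items).
# We need to tokenize this in a way that BERT can process, and also track the original token positions.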

def tokenize_set_for_bert(set_text: str) -> Tuple[List[str], List[int], Dict[str, List[int]]]:
    """Tokenize a SET string and assign conceptual soft-position IDs.

    A SET is the original text followed by optional knowledge blocks such as
    ``{PECK: ['hope', 'Joy'], ['better', 'kind', 'Gratitude/Pride']}``: each
    entry is the ``str()`` of a Python list, so it has single brackets and
    quoted items. The double-bracket, unquoted form ``{PECK: [[healthy, Joy]]}``
    is accepted as well.

    The original text is lowercased and split on whitespace; its tokens get
    positions 1..n. Each knowledge entry is appended after the text tokens,
    and all of its tokens share the position of the word the entry refers to
    (its first item). If that word does not occur in the text, the entry gets
    one fresh position of its own.

    Returns:
        bert_tokens: ``['[CLS]', <text tokens>, <knowledge tokens>, '[SEP]']``.
        soft_positions: One position ID per entry of ``bert_tokens``.
        knowledge_mapping: ``referred word -> indices into bert_tokens`` of the
            knowledge tokens attached to it.
    """
    bert_tokens = ['[CLS]']  # Start token for BERT
    soft_positions = [0]     # Position for [CLS]

    # A block is "{LABEL: [ ... ]}"; the non-greedy match stops at the first
    # "]}", which is the end of the block for both supported entry formats.
    knowledge_blocks = re.findall(r'\{\w+:\s*\[.*?\]\}', set_text)

    # Remove knowledge blocks so that only the original text is word-tokenized.
    original_text_part = set_text
    for block in knowledge_blocks:
        original_text_part = original_text_part.replace(block, ' ')

    current_pos_id = 1  # Starting position for actual words
    token_to_pos_map = {}  # word -> soft position (last occurrence wins)
    for token in original_text_part.lower().split():
        bert_tokens.append(token)
        soft_positions.append(current_pos_id)
        token_to_pos_map[token] = current_pos_id
        # Knowledge words consist of letters only, while raw tokens may carry
        # punctuation ("hope."); also index the letters-only form so that the
        # two can be linked.
        letters_only = re.sub(r'[^a-z]', '', token)
        if letters_only:
            token_to_pos_map[letters_only] = current_pos_id
        current_pos_id += 1

    knowledge_mapping = {}
    for block in knowledge_blocks:
        # Innermost "[...]" groups are the individual knowledge entries.
        for entry_str in re.findall(r'\[([^\[\]]*)\]', block):
            # Strip whitespace and the quotes added by str(list); drop empties.
            parts = [p.strip().strip('\'"').lower() for p in entry_str.split(',')]
            parts = [p for p in parts if p]
            if not parts:
                continue

            # The first item of an entry is the word it refers to.
            referred_word = parts[0]
            pos_id_for_knowledge = token_to_pos_map.get(referred_word)
            if pos_id_for_knowledge is None:
                # Unlinked knowledge: give the whole entry one new position.
                pos_id_for_knowledge = current_pos_id
                current_pos_id += 1

            first_index = len(bert_tokens)
            bert_tokens.extend(parts)
            soft_positions.extend([pos_id_for_knowledge] * len(parts))

            # Record exactly the indices just appended (for masked attention).
            knowledge_mapping.setdefault(referred_word, []).extend(
                range(first_index, len(bert_tokens))
            )

    bert_tokens.append('[SEP]')  # End token for BERT
    soft_positions.append(current_pos_id)  # Assign final position to SEP

    return bert_tokens, soft_positions, knowledge_mapping

print("\n--- Generating conceptual BERT tokens and Soft-Position Embeddings for SETs ---")
# Apply the function to a sample to demonstrate.
# Every SET contains at least the original text, so "first non-empty SET" would
# always be the first row. Prefer the first row whose SET actually carries an
# embedded {PECK: ...} / {NECK: ...} block, and fall back to the first non-empty
# SET only when no row has one.
if df_combined['SET'].empty:
    sample_index = 0
else:
    has_knowledge = df_combined['SET'].astype(str).str.contains(r'\{(?:PECK|NECK):', regex=True)
    if has_knowledge.any():
        sample_index = has_knowledge.idxmax()  # label of the first True
    else:
        sample_index = df_combined['SET'].astype(bool).idxmax()
sample_set = df_combined.loc[sample_index, 'SET']
sample_text_raw = df_combined.loc[sample_index, 'text_raw']

if sample_set:
    sample_bert_tokens, sample_soft_positions, sample_knowledge_map = tokenize_set_for_bert(sample_set)
    print(f"\nOriginal Text Sample:\n'{sample_text_raw}'")
    print(f"\nGenerated SET Sample:\n'{sample_set}'")
    print(f"\nConceptual BERT Input Tokens for SET:\n{sample_bert_tokens}")
    print(f"\nConceptual Soft-Position Embeddings:\n{sample_soft_positions}")
    print(f"\nConceptual Knowledge Mapping (word -> knowledge token indices):\n{sample_knowledge_map}")

    # Demonstrate position association: one line per token with its position.
    print("\n--- Soft-Positioning Demonstration ---")
    for demo_token, demo_position in zip(sample_bert_tokens, sample_soft_positions):
        print(f"Token: '{demo_token}', Soft-Position: {demo_position}")

else:
    print("No non-empty SETs found to demonstrate tokenization and soft-positioning.")


# ====================================================================
# STEP 4.3: CONCEPTUAL MASK-TRANSFORMER ENCODER (Discussion)
# ====================================================================

# The statements in this section only print explanatory text; nothing is
# computed and no model is built here.
print("\n--- Conceptual Mask-Transformer Encoder (Discussion) ---")
print("The Mask-Transformer Encoder modifies BERT's self-attention mechanism.")
print("It uses a 'visible matrix' (Mij, as per Equation 21 in the paper) to control")
print("which tokens can attend to which other tokens.")
print("\nKey Idea:")
print("1. Original Text Tokens: Can attend to all other original text tokens.")
print("2. Knowledge Tokens: E.g., 'Joy' from '[[healthy, Joy]]'")
print("   - Can attend to the original word it's associated with (e.g., 'healthy').")
print("   - Can attend to other knowledge tokens *within the same branch* (e.g., 'healthy' can attend to 'Joy').")
print("   - Are prevented from attending to *unrelated* original text tokens or knowledge from *different branches*.")
print("\nThis masking prevents irrelevant knowledge from influencing the representation")
print("of other parts of the sentence, thus mitigating knowledge noise and preserving")
print("the original meaning while still leveraging the enhanced features.")
print("\nImplementing this would require modifying the attention mask within a deep learning")
print("framework (PyTorch/TensorFlow) for a BERT-like model.")


# ====================================================================
# STEP 4.4: FINAL MODEL PREDICTION (Conceptual)
# ====================================================================

# Also print-only: describes what a trained classification head would do.
print("\n--- Conceptual Final Model Prediction ---")
print("After the SETs are processed by the knowledge-enabled BERT (with soft-positioning")
print("and mask-transformer), the model produces a rich feature representation for each SET.")
print("This feature representation is then passed to a linear classification layer.")
print("The linear layer (similar to Equation 15) uses the learned features to predict")
print("the final sentiment polarity (positive, negative, neutral) of the OPOE.")
print("\nAdditionally, the SAFA_PECK and SAFA_NECK components (stored in your DataFrame)")
print("serve as explainable emotion knowledge, detailing *how* the sentiment polarity")
print("was derived from fine-grained emotion categories.")

# For demonstration, we'll assign a final sentiment prediction based on the chosen SAFA knowledge
def final_sentiment_prediction_conceptual(safa_pec: List[Any], safa_nec: List[Any], bert_label: int) -> int:
    """
    Derive a stand-in final sentiment label from the SAFA outcome.

    A real model would take this label from the linear classification layer
    after BERT has processed the SET; here the label simply mirrors which
    kind of knowledge SAFA incorporated.

    Args:
        safa_pec: Positive emotion knowledge chosen by SAFA (empty if none).
        safa_nec: Negative emotion knowledge chosen by SAFA (empty if none).
        bert_label: The BERT-only prediction, used as the fallback.

    Returns:
        0 (positive) when only positive knowledge is present, 1 (negative)
        when only negative knowledge is present, otherwise ``bert_label``
        (both kinds present, or neither).
    """
    has_positive = bool(safa_pec)
    has_negative = bool(safa_nec)

    # Mixed knowledge and no knowledge are both "no clear signal" cases, so
    # they share the same fallback to BERT's own prediction.
    if has_positive == has_negative:
        return bert_label

    # Exactly one kind of knowledge was incorporated.
    return 0 if has_positive else 1

print("\n--- Simulating Final Sentiment Labels based on SAFA Outcome ---")


def _predict_row_sentiment(row):
    """Apply the conceptual SAFA-based prediction rule to one DataFrame row."""
    return final_sentiment_prediction_conceptual(
        row['SAFA_PECK'], row['SAFA_NECK'], row['sentiment_label_bert']
    )


# Row-wise application: each row supplies its own knowledge lists and BERT label.
df_combined['final_sentiment_predicted'] = df_combined.apply(_predict_row_sentiment, axis=1)

# ====================================================================
# STEP 4.5: FINAL OUTPUT PREPARATION
# ====================================================================

# Columns exported for the downstream analysis step, in display order.
final_output_cols = [
    'source',
    'text_raw',
    'sentiment_label',
    'sentiment_label_bert',
    'final_sentiment_predicted',
    'SAFA_Reason',
    'SAFA_PECK',
    'SAFA_NECK',
    'SET',
]

# Select the export columns once and reuse the selection for both the
# on-screen preview and the CSV written to the working directory.
_final_output_view = df_combined[final_output_cols]

print("\n--- Final Conceptual Output Sample with Predicted Sentiment ---")
print(_final_output_view.head())

_final_output_view.to_csv('final_ecr_bert_conceptual_results.csv', index=False)
print(
    "\nFinal conceptual results saved to 'final_ecr_bert_conceptual_results.csv'.",
    "\nThis concludes the conceptual implementation of the ECR-BERT pipeline.",
    "To move beyond conceptual demonstration, the next stage would involve actual deep learning framework implementation.",
    sep="\n",
)

Loaded SAFA processed data for BERT integration from 'safa_processed_mental_health_data.csv' with 151288 rows.

--- Generating conceptual BERT tokens and Soft-Position Embeddings for SETs ---

Original Text Sample:
'This year felt like literal hell. It’s over now and I’m happy it is but I’m so embarrassed about how I acted. I was living in a bug infested apartment sleeping on the floor, I couldn’t wash my hair or clothes for an embarrassing amount of time because I couldn’t afford shampoo or detergent because all of my my money was being taken by a pimp who set me up to get gang raped, would take videos without me knowing which the police ended up seeing, sold me against my will and was giving me amphetamines. I had no friends because he isolated me from them and I was constantly stressed out I went fucking insane. After he was in jail I was still living there and I would cry and scream and hit myself in the head and pull my hair out. I just wanted the images out of my fucking head and

In [7]:
import pandas as pd
import numpy as np
from typing import List, Dict, Any, Tuple

# ====================================================================
# STEP 5.1: LOAD FINAL CONCEPTUAL RESULTS
# ====================================================================

import ast  # imported here so this section works without touching the cell's import header


def _parse_knowledge_cell(cell: Any) -> List[Any]:
    """
    Turn one CSV cell of a SAFA knowledge column back into a Python list.

    The lists were written to CSV as their string representation
    (e.g. "[['panic', 'Distress']]"). Non-string cells (such as NaN for a
    missing value) are treated as "no knowledge" and become an empty list.

    ``ast.literal_eval`` is used instead of ``eval``: it only accepts Python
    literals, so content in the CSV file can never execute code.
    Malformed cells raise ``ValueError``/``SyntaxError`` rather than being
    silently accepted.
    """
    if not isinstance(cell, str):
        return []
    return ast.literal_eval(cell)


# Only the read itself can raise FileNotFoundError, so keep the try body minimal.
try:
    df_results = pd.read_csv('final_ecr_bert_conceptual_results.csv')
except FileNotFoundError:
    print("Error: 'final_ecr_bert_conceptual_results.csv' not found. Please ensure Step 4 was run successfully.")
    raise
else:
    # Convert lists back from string representation
    for _knowledge_col in ('SAFA_PECK', 'SAFA_NECK'):
        df_results[_knowledge_col] = df_results[_knowledge_col].apply(_parse_knowledge_cell)
    print(f"Loaded final conceptual results for analysis from 'final_ecr_bert_conceptual_results.csv' with {len(df_results)} rows.")

# ====================================================================
# STEP 5.2: QUALITATIVE ANALYSIS (Inspired by Table 6 in the paper)
# ====================================================================

print("\n--- QUALITATIVE ANALYSIS: Inferred Emotions and Sentiment Explanations ---")
print("Comparing original text, SAFA's chosen knowledge, and final predicted sentiment.")
print("The goal is to see how the SAFA_PECK/NECK provide an explanation for the sentiment.")

# Select a few diverse samples for qualitative review.
# Prefer rows where SAFA actually incorporated some knowledge.
_has_knowledge = df_results['SAFA_PECK'].apply(bool) | df_results['SAFA_NECK'].apply(bool)
_sample_pool = df_results[_has_knowledge]

# Choose the pool BEFORE sampling. Sampling n=5 from a frame with fewer than
# five rows raises ValueError, so checking `.empty` only after sampling would
# make this fallback unreachable.
if _sample_pool.empty:
    print("\nWarning: No samples found with incorporated PECK/NECK. Displaying general samples.")
    _sample_pool = df_results

# Cap the sample size at the pool size so small pools are shown in full
# instead of crashing; with five or more rows the selection is unchanged.
sample_df = _sample_pool.sample(n=min(5, len(_sample_pool)), random_state=42)

# Sentiment mapping for readability
sentiment_map = {0: 'Positive', 1: 'Negative', 2: 'Neutral (Simulated)'}

for index, row in sample_df.iterrows():
    print(f"\n----- Sample {index} (Source: {row['source'].upper()}) -----")
    print(f"Original Post: {row['text_raw'][:200]}...") # Truncate long posts
    print(f"Initial Sentiment Label (Default Negative): {sentiment_map[row['sentiment_label']]}")
    print(f"Simulated BERT-only Prediction: {sentiment_map[row['sentiment_label_bert']]}")
    print(f"Final ECR-BERT Conceptual Prediction: {sentiment_map[row['final_sentiment_predicted']]}")
    print(f"SAFA Reason: {row['SAFA_Reason']}")
    # Knowledge lists are only printed when non-empty.
    if row['SAFA_PECK']:
        print(f"  Inferred Positive Knowledge (PECK): {row['SAFA_PECK']}")
    if row['SAFA_NECK']:
        print(f"  Inferred Negative Knowledge (NECK): {row['SAFA_NECK']}")
    print(f"SET (Input to conceptual BERT): {row['SET'][:200]}...") # Truncate for display

# Discussion on consistency
# Fixed discussion text for the qualitative samples. Emitting the tuple with a
# newline separator yields the same output as one print() call per line.
_QUALITATIVE_DISCUSSION = (
    "\n--- Qualitative Analysis Discussion ---",
    "Observations from samples:",
    "1. The 'SAFA_PECK' and 'SAFA_NECK' provide clear, fine-grained emotional categories.",
    "2. The 'SAFA_Reason' explains *why* a particular type of knowledge was chosen for incorporation.",
    "3. In cases where PECK or NECK were incorporated, the 'final_sentiment_predicted' often aligns",
    "   with the polarity of the incorporated knowledge, demonstrating explainability.",
    "4. Discrepancies between 'sentiment_label_bert' and 'final_sentiment_predicted' (when they occur)",
    "   highlight SAFA's role in refining sentiment based on ECR insights.",
    "   (Note: Our simulation might not produce strong discrepancies without specific rules for `simulate_bert_prediction`).",
    "5. Cases with no SAFA_PECK/NECK mean either no emotion words were detected by ECR, or SAFA",
    "   decided not to incorporate knowledge (e.g., low CS_ECR, BERT neutral, or mixed/conflicting ES).",
    "   In these cases, the final prediction defaults to the BERT-only prediction.",
)
print(*_QUALITATIVE_DISCUSSION, sep="\n")


# ====================================================================
# STEP 5.3: CONCEPTUAL ABLATION STUDY (Discussion)
# ====================================================================

# Discussion-only ablation study: one tuple per printed paragraph (intro, the
# five model variants, conclusion). Every line is printed on its own row, so
# the console output matches a sequence of individual print() calls.
_ABLATION_SECTIONS = (
    (
        "\n--- CONCEPTUAL ABLATION STUDY (Discussion) ---",
        "An ablation study systematically removes components of a model to understand their individual contribution.",
        "Based on the paper's Fig. 9 and our conceptual implementation, here's what we would expect:",
    ),
    (
        "\n**1. Baseline (B - BERT only):**",
        "   - This would be equivalent to our `sentiment_label_bert` (the simulated BERT prediction).",
        "   - It provides a strong baseline, leveraging BERT's pre-trained language understanding.",
        "   - Lacks explainability in terms of fine-grained emotions.",
    ),
    (
        "\n**2. BERT + ECR (B, E):**",
        "   - Here, ECR-derived knowledge (PECK/NECK) would be *always* incorporated into BERT's input (SETs), without SAFA.",
        "   - This would likely show some improvement over BERT only, as auxiliary knowledge helps.",
        "   - However, without SAFA, it would suffer from 'knowledge noise' (incorporating conflicting or irrelevant knowledge), potentially leading to less optimal performance than the full model.",
    ),
    (
        "\n**3. BERT + ECR + Knowledge-Enabled Feature Representation (B, E, K):**",
        "   - This adds Soft-Position Embedding and Mask-Transformer to (B, E).",
        "   - The paper shows this improves performance over (B, E) because the specialized feature representation better handles the SET structure and reduces noise during processing.",
        "   - It helps BERT *understand* the tree-like structure of the SETs more effectively.",
    ),
    (
        "\n**4. BERT + ECR + Self-Adaptive Fusion (B, E, S):**",
        "   - This combines BERT, ECR, and the SAFA, but without the Knowledge-Enabled Feature Representation (soft-positioning/mask-transformer).",
        "   - SAFA's selective incorporation *reduces knowledge noise* by choosing relevant knowledge based on confidence.",
        "   - We would expect this to outperform (B, E) due to improved knowledge quality, but likely be slightly worse than (B, E, K) because the vanilla BERT might still struggle with the complex SET structure without specialized embeddings/attention.",
    ),
    (
        "\n**5. ECR-BERT (B, E, S, K - Our Proposed Model):**",
        "   - This is the full model, combining BERT, ECR, Self-Adaptive Fusion, and Knowledge-Enabled Feature Representation.",
        "   - The paper's results (and our expectation) show this achieves the best performance.",
        "   - It leverages the power of BERT, the explainability and insights from ECR, the noise reduction from SAFA, and the structural understanding from knowledge-enabled feature representation.",
        "   - It provides both accurate *and* explainable sentiment analysis results for OPOEs (and in your case, mental health discussions).",
    ),
    (
        "\n**Conclusion from Ablation:**",
        "Each component (ECR for knowledge, SAFA for selective fusion, and Knowledge-Enabled Feature Representation for structural understanding) plays a crucial role in enhancing the accuracy and explainability of the ECR-BERT model.",
    ),
)
for _ablation_section in _ABLATION_SECTIONS:
    print(*_ablation_section, sep="\n")

# Closing checklist of what a non-conceptual implementation would still need.
# Joined with newlines and printed once; the output is identical to printing
# each line separately.
_CLOSING_NOTES = [
    "\n--- END OF CONCEPTUAL IMPLEMENTATION ---",
    "To fully implement this ECR-BERT model, you would need to:",
    "1. Train actual BERT models for sentiment prediction (replacing `simulate_bert_prediction`).",
    "2. Implement the Soft-Position Embedding and Mask-Transformer within a deep learning framework (e.g., PyTorch, TensorFlow).",
    "3. Train the full ECR-BERT model end-to-end on your dataset.",
    "4. Conduct thorough quantitative evaluations (accuracy, F1) and fine-tune hyperparameters (like SAFA threshold).",
]
print("\n".join(_CLOSING_NOTES))

Loaded final conceptual results for analysis from 'final_ecr_bert_conceptual_results.csv' with 151288 rows.

--- QUALITATIVE ANALYSIS: Inferred Emotions and Sentiment Explanations ---
Comparing original text, SAFA's chosen knowledge, and final predicted sentiment.
The goal is to see how the SAFA_PECK/NECK provide an explanation for the sentiment.

----- Sample 122201 (Source: OCD) -----
Original Post: Hey guys, 

So lately my intrusive thoughts have been really bad and much worse. I’d also like to note that I frequently experience depersonalization/de-realization. Lately my thoughts are trying to c...
Initial Sentiment Label (Default Negative): Negative
Simulated BERT-only Prediction: Negative
Final ECR-BERT Conceptual Prediction: Negative
SAFA Reason: ECR-preferred NECK (high CS_ECR, negative ES)
  Inferred Negative Knowledge (NECK): [['panic', 'Distress'], ['anxiety', 'Distress']]
SET (Input to conceptual BERT): Hey guys, 

So lately my intrusive thoughts have been really bad and muc