# AI-Driven Identification and Management of Somatic Symptom Disorder in Primary Care



In [3]:
# AI-Driven Identification and Management of Somatic Symptom Disorder in Primary Care
# Part 1: Environment Setup, Data Loading and Preprocessing

# This implementation is specifically designed for the CPCSSN dataset as described
# in the Data YAML and Data Description documents.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import re
import warnings
from datetime import datetime, timedelta

# Fix the NumPy RNG seed so that random steps repeat between runs
np.random.seed(42)

# Silence every warning category for the remainder of the session
warnings.filterwarnings('ignore')

print("Starting SSD identification pipeline...")

#################################
# 1. Environment Setup
#################################

# Choose the compute device. With PyTorch installed, `device` is a
# torch.device ("cuda" when a GPU is reported, otherwise "cpu"); without
# PyTorch it is the plain string "cpu".
try:
    import torch
    gpu_available = torch.cuda.is_available()
    device = torch.device("cuda" if gpu_available else "cpu")
    if gpu_available:
        status_message = f"GPU available: {torch.cuda.get_device_name(0)}"
    else:
        status_message = "GPU not available, using CPU"
    print(status_message)
except ImportError:
    gpu_available = False
    device = "cpu"
    print("PyTorch not installed, using CPU")

#################################
# 2. Data Loading Configuration
#################################

# Define paths based on YAML configuration
# (load_table in this file reads from PREPARED_DATA_PATH)
RAW_DATA_PATH = "extracted_data/"
PREPARED_DATA_PATH = "prepared_data/"
FILE_EXTENSION = ".csv"
CHUNK_SIZE = 500000  # Rows per chunk when a CSV is read without a row limit

# Define tables to load based on YAML
# Maps a logical table name to the CSV base name (without extension) under
# PREPARED_DATA_PATH.
TABLES = {
    'Encounter': 'Encounter_prepared',
    'EncounterDiagnosis': 'EncounterDiagnosis_prepared',
    'HealthCondition': 'HealthCondition_prepared',
    'Lab': 'Lab_prepared',
    'Medication': 'Medication_prepared',
    'MedicalProcedure': 'MedicalProcedure_prepared',
    'Referral': 'Referral_prepared',
    'RiskFactor': 'RiskFactor_prepared',
    'PatientDemographic_merged': 'PatientDemographic_merged_prepared'
}

# Define date columns for each table based on YAML
# Keys match TABLES; values are the columns to be parsed as datetimes on load.
DATE_COLUMNS = {
    'Encounter': ['EncounterDate', 'DateCreated'],
    'EncounterDiagnosis': ['DateCreated'],
    'HealthCondition': ['DateCreated'],
    'Lab': ['PerformedDate', 'DateCreated'],
    'MedicalProcedure': ['PerformedDate', 'DateCreated'],
    'Medication': ['StartDate', 'StopDate', 'DateCreated'],
    'Referral': ['CompletedDate', 'DateCreated'],
    'RiskFactor': ['StartDate', 'EndDate', 'DateCreated'],
    'PatientDemographic_merged': ['DateCreated']
}

# Specific SSD-related ICD-9 codes based on validated DSM-IV to DSM-5 crosswalks
# These are compared as exact strings, so a code stored numerically (e.g.
# 307.8 instead of '307.80') will not match.
SSD_SPECIFIC_ICD9_CODES = [
    '300.81',  # Somatization Disorder (in DSM-5, encompassed by SSD)
    '300.82',  # Undifferentiated Somatoform Disorder / SSD / Unspecified somatic symptom disorder
    '300.7',   # Hypochondriasis/Illness Anxiety Disorder
    '300.11',  # Conversion Disorder (Functional Neurological Symptom Disorder)
    '307.80',  # Pain Disorder associated with psychological factors
    '307.89',  # Pain Disorder with both psychological factors and medical condition
    '316',     # Psychological Factors Affecting Other Medical Conditions
    '300.16',  # Factitious Disorder (predominantly psychological signs)
    '300.19',  # Factitious Disorder (predominantly physical signs)
    '301.51',  # Chronic factitious illness with physical symptoms
    '300.89'   # Other Specified Somatic Symptom and Related Disorders
]

# Broader mental health and somatic symptom ICD-9 code ranges
# Each tuple is an inclusive (start, end) range over the integer part of the code.
MENTAL_HEALTH_ICD9_RANGES = [
    (290, 319),  # Primary mental health codes
    (327, 327),  # Sleep disorders
    (331, 333),  # Neurological conditions
    (780, 780),  # General symptoms (ICD-9 780.x). NOTE(review): 781 (symptoms involving nervous and musculoskeletal systems) is not covered - confirm intended
    (786, 788),  # Symptoms involving respiratory, digestive, urinary systems
    (799, 799)   # Other ill-defined conditions
]

# Common somatic symptom keywords organized by context
# The three lists below are the default keyword categories of
# analyze_text_fields, which matches them case-insensitively as literal
# substrings.

# Clinical Documentation Terminology (from physician notes)
CLINICAL_DOCUMENTATION_KEYWORDS = [
    'medically unexplained symptoms', 'no organic cause', 'no medical explanation',
    'disproportionate to findings', 'somatic complaints', 'psychosomatic', 'psychogenic',
    'functional neurological symptom', 'PNES', 'pseudoseizures', 'conversion disorder',
    'health anxiety', 'illness preoccupation despite reassurance', 'frequent flyer',
    'doctor shopping', 'symptom magnification', 'symptom amplification',
    'out of proportion to examination', 'high healthcare utilization',
    'multiple negative workups', 'extensive negative workup', 'normal tests',
    'benign exam', 'reassurance ineffective', 'multiple somatic symptoms'
]

# Patient-Reported Symptom Descriptions
PATIENT_REPORT_KEYWORDS = [
    'constant pain but doctors can\'t find anything', 'tired all the time',
    'exhausted all the time', 'whole body hurts', 'pain all over',
    'headaches every day and nothing helps', 'short of breath but tests normal',
    'worried I have a serious disease', 'no one understands how bad I feel',
    'doctors must have missed something', 'I know something is wrong',
    'keep checking my body', 'medicine doesn\'t make me better',
    'sensitive to medications', 'stress makes symptoms worse',
    'symptoms across multiple systems', 'headaches stomach pains nausea fatigue',
    'multiple sites of pain', 'symptoms come and go', 'been to many doctors'
]

# NLP-Oriented Phrases and Patterns
NLP_ORIENTED_KEYWORDS = [
    'medically unexplained physical symptoms', 'MUPS', 'multiple unexplained symptoms',
    'symptoms with no clear etiology', 'persistent somatic complaints despite normal workup',
    'excessive health-related behaviors', 'illness worry not alleviated by medical reassurance',
    'high health anxiety', 'functional symptom disorder', 'conversion symptoms',
    'somatic focus', 'somatic preoccupation', 'negative diagnostic cascade',
    'denies relief after reassurance', 'symptom fixation', 'symptom vigilance',
    'health-related fear', 'excessive time and energy on health concerns',
    'catastrophizing health symptoms', 'heightened body awareness'
]

# Consolidated list of all somatic symptom keywords for text analysis
# (plain concatenation; entries are not de-duplicated across categories)
SOMATIC_KEYWORDS = (CLINICAL_DOCUMENTATION_KEYWORDS + 
                   PATIENT_REPORT_KEYWORDS + 
                   NLP_ORIENTED_KEYWORDS)

#################################
# 3. Data Loading Functions
#################################

def load_table(table_name, date_columns=None, nrows=None):
    """
    Load a CPCSSN table from PREPARED_DATA_PATH and parse its date columns

    The CSV is read first and the date columns are converted afterwards with
    ``pd.to_datetime(errors='coerce')``. This avoids the ``date_parser``
    argument of ``read_csv`` (deprecated in pandas 2.x) and means that a date
    column missing from the file only produces a warning instead of failing
    the whole load.

    Args:
        table_name (str): The name of the table (a key of TABLES)
        date_columns (list): List of columns containing dates
        nrows (int): Number of rows to load (None for all)

    Returns:
        pandas.DataFrame: The loaded table. On any read error an empty
        DataFrame without columns is returned and the error is printed.

    Raises:
        KeyError: If table_name is not a key of TABLES
    """
    filename = TABLES[table_name] + FILE_EXTENSION
    filepath = os.path.join(PREPARED_DATA_PATH, filename)

    print(f"Loading {table_name} from {filepath}...")

    try:
        if nrows is not None:
            # Explicit row limit (including 0): a single bounded read
            df = pd.read_csv(filepath, nrows=nrows)
        else:
            # For large files, read in chunks and concatenate
            chunks = list(pd.read_csv(filepath, chunksize=CHUNK_SIZE))
            if chunks:
                df = pd.concat(chunks)
            else:
                # Header-only file: keep the column names
                df = pd.read_csv(filepath, nrows=0)

        # Unparseable dates become NaT rather than raising
        for column in date_columns or []:
            if column in df.columns:
                df[column] = pd.to_datetime(df[column], errors='coerce')
            else:
                print(f"Warning: date column '{column}' not found in {table_name}")

        print(f"Successfully loaded {len(df)} rows from {table_name}")
        return df

    except Exception as e:
        # Deliberately best-effort: callers in this file test len(df) == 0
        # and skip the table, so report the problem and hand back an empty frame
        print(f"Error loading {table_name}: {e}")
        return pd.DataFrame(columns=[])

#################################
# 4. Data Preprocessing Functions
#################################

def preprocess_patient_demographics(df, reference_year=None):
    """
    Preprocess the PatientDemographic_merged table

    Keeps patients who have not opted out, restricts to the essential
    columns, derives an approximate ``Age`` and a 3-character ``Region``
    from the postal code, and keeps adults (18+) only. The input frame is
    not modified.

    Args:
        df (pandas.DataFrame): The raw patient demographics table
        reference_year (int): Year used to compute Age from BirthYear.
            Defaults to the current calendar year.

    Returns:
        pandas.DataFrame: Preprocessed patient demographics. An empty input
        is returned unchanged.

    Raises:
        KeyError: If any essential column is missing from a non-empty input
    """
    if len(df) == 0:
        return df

    # Keep only essential columns
    essential_cols = [
        'PatientDemographic_ID', 'Patient_ID', 'Network_ID', 'Site_ID',
        'Sex', 'BirthYear', 'BirthMonth', 'ResidencePostalCode',
        'PatientStatus_calc', 'OptedOut'
    ]

    missing_cols = [col for col in essential_cols if col not in df.columns]
    if missing_cols:
        raise KeyError(
            f"PatientDemographic table is missing required columns: {missing_cols}"
        )

    if reference_year is None:
        reference_year = datetime.now().year

    # Only keep patients who have not opted out; copy so the derived columns
    # are written to an independent frame rather than a slice of the input
    demographics = df.loc[df['OptedOut'] == 0, essential_cols].copy()

    # Calculate age (approximate); non-numeric birth years become NaN and
    # are dropped by the adult filter below
    birth_year = pd.to_numeric(demographics['BirthYear'], errors='coerce')
    demographics['Age'] = reference_year - birth_year

    # Filter to adults only (18+)
    demographics = demographics[demographics['Age'] >= 18].copy()

    # Convert categorical variables
    demographics['Sex'] = demographics['Sex'].astype('category')

    # First 3 characters of the postal code for regional analysis; missing
    # postal codes stay missing instead of turning into the text 'nan'
    postal_code = demographics['ResidencePostalCode']
    region = postal_code.astype(str).str.slice(0, 3).where(postal_code.notna())
    demographics['Region'] = region.astype('category')

    return demographics

def identify_mental_health_conditions(df):
    """
    Identify patients with mental health conditions from HealthCondition table,
    with specific focus on SSD and related conditions

    A row counts as a mental health row when its DiagnosisCode_calc is one of
    SSD_SPECIFIC_ICD9_CODES, when the integer part of the code falls in one of
    MENTAL_HEALTH_ICD9_RANGES, or (if DiagnosisText_calc exists) when the text
    contains a mental health keyword. The result has exactly one row per
    patient; the input frame is not modified.

    Args:
        df (pandas.DataFrame): The HealthCondition table

    Returns:
        pandas.DataFrame: One row per patient with at least one mental health
        row, with columns Patient_ID, has_ssd_related_code (1 if any of the
        patient's mental health rows is SSD-related by code or text, else 0)
        and has_mental_health_condition (always 1). Empty, with those three
        columns, when the input is empty or has no DiagnosisCode_calc column.
    """
    empty_result = pd.DataFrame(
        columns=['Patient_ID', 'has_mental_health_condition', 'has_ssd_related_code']
    )
    if len(df) == 0 or 'DiagnosisCode_calc' not in df.columns:
        return empty_result

    # Set for O(1) exact-match lookups
    ssd_codes = set(SSD_SPECIFIC_ICD9_CODES)

    # Function to check if code is specifically a SSD-related code
    def is_ssd_related_code(code):
        if pd.isna(code) or not code:
            return False
        return str(code).strip() in ssd_codes

    # Function to check if code is in mental health range
    def is_mental_health_code(code):
        if pd.isna(code) or not code:
            return False
        # First check specific SSD codes (exact matching)
        if is_ssd_related_code(code):
            return True
        # Then check range-based codes on the integer part of the code;
        # non-numeric codes (e.g. V/E codes) are simply not in any range
        try:
            code_int = int(float(code))
        except (TypeError, ValueError, OverflowError):
            return False
        return any(start <= code_int <= end for start, end in MENTAL_HEALTH_ICD9_RANGES)

    codes = df['DiagnosisCode_calc']
    has_mental_health = codes.apply(is_mental_health_code).astype(bool)
    has_ssd_related = codes.apply(is_ssd_related_code).astype(bool)

    # Also check for keywords in DiagnosisText_calc
    if 'DiagnosisText_calc' in df.columns:
        mental_health_keywords = [
            'depression', 'anxiety', 'mental', 'psychiatric', 'psychological',
            'mood', 'somatoform', 'somatic symptom', 'hypochondriasis', 'illness anxiety'
        ]

        # Additional SSD-specific keywords
        ssd_specific_keywords = [
            'somatic symptom disorder', 'somatization', 'somatoform', 'psychosomatic',
            'conversion disorder', 'functional neurological', 'illness anxiety',
            'hypochondriasis', 'factitious disorder', 'pain disorder',
            'medically unexplained', 'psychological factors affecting'
        ]

        diagnosis_text = df['DiagnosisText_calc'].astype(str)
        has_mental_health = has_mental_health | diagnosis_text.str.contains(
            '|'.join(mental_health_keywords), case=False, na=False
        )
        has_ssd_related = has_ssd_related | diagnosis_text.str.contains(
            '|'.join(ssd_specific_keywords), case=False, na=False
        )

    # NOTE(review): rows that are SSD-related by text only (e.g. 'pain
    # disorder') but not flagged as mental health are ignored here, as in the
    # previous implementation - confirm this is intended.
    flagged_rows = pd.DataFrame({
        'Patient_ID': df.loc[has_mental_health, 'Patient_ID'],
        'has_ssd_related_code': has_ssd_related[has_mental_health],
    })

    # Collapse to one row per patient (first-appearance order); a patient is
    # SSD-related if any of their mental health rows is
    mental_health_patients = (
        flagged_rows.groupby('Patient_ID', sort=False)['has_ssd_related_code']
        .any()
        .astype(int)
        .reset_index()
    )
    mental_health_patients['has_mental_health_condition'] = 1

    return mental_health_patients

def calculate_patient_encounter_stats(encounter_df, encounter_diag_df, start_date, end_date):
    """
    Calculate patient encounter statistics for a specific time period

    Encounters are restricted to start_date <= EncounterDate <= end_date and
    counted per patient. When diagnosis codes can be linked through
    Encounter_ID, the number of symptom codes (integer part of
    DiagnosisCode_calc in 780-789) per patient and its ratio to the encounter
    count are added. Patient_ID is always taken from the Encounter table, so
    the symptom statistics are produced even if EncounterDiagnosis has no
    Patient_ID column. Neither input frame is modified.

    Args:
        encounter_df (pandas.DataFrame): The Encounter table
        encounter_diag_df (pandas.DataFrame): The EncounterDiagnosis table
        start_date (datetime): Start date for the analysis period
        end_date (datetime): End date for the analysis period

    Returns:
        pandas.DataFrame: Patient-level encounter statistics with columns
        Patient_ID and encounter_count, plus symptom_code_count and
        symptom_encounter_ratio when diagnoses could be linked. An empty
        DataFrame is returned when the Encounter table is empty or lacks
        Encounter_ID, Patient_ID or EncounterDate. If the diagnosis table is
        empty or lacks Encounter_ID / DiagnosisCode_calc, only the encounter
        counts are returned.
    """
    if len(encounter_df) == 0:
        return pd.DataFrame()

    # Print column names for debugging
    print(f"Encounter DataFrame columns: {encounter_df.columns.tolist()}")
    print(f"EncounterDiagnosis DataFrame columns: {encounter_diag_df.columns.tolist()}")

    required_encounter_cols = ['Encounter_ID', 'Patient_ID', 'EncounterDate']
    required_diag_cols = ['Encounter_ID', 'Patient_ID', 'DiagnosisCode_calc']

    if not all(col in encounter_df.columns for col in required_encounter_cols):
        print(f"Warning: Missing required columns in Encounter table. Required: {required_encounter_cols}")
        return pd.DataFrame()

    # Filter encounters within the date range (rows with a missing date
    # compare False and are excluded); only the linkage columns are kept
    in_window = (
        (encounter_df['EncounterDate'] >= start_date) &
        (encounter_df['EncounterDate'] <= end_date)
    )
    valid_encounters = encounter_df.loc[in_window, ['Encounter_ID', 'Patient_ID']]

    # Count encounters per patient
    encounter_counts = valid_encounters.groupby('Patient_ID').size().reset_index(name='encounter_count')

    # Without usable diagnosis data the encounter counts are still valid
    if len(encounter_diag_df) == 0:
        return encounter_counts

    if not all(col in encounter_diag_df.columns for col in required_diag_cols):
        print(f"Warning: Missing required columns in EncounterDiagnosis table. Required: {required_diag_cols}")

    if ('DiagnosisCode_calc' not in encounter_diag_df.columns
            or 'Encounter_ID' not in encounter_diag_df.columns):
        return encounter_counts

    # Join on Encounter_ID, and additionally on Patient_ID when the diagnosis
    # table carries it
    join_columns = ['Encounter_ID']
    if 'Patient_ID' in encounter_diag_df.columns:
        join_columns.append('Patient_ID')

    # Print the join columns for debugging
    print(f"Joining on columns: {join_columns}")

    encounter_with_diag = pd.merge(
        valid_encounters,
        encounter_diag_df[join_columns + ['DiagnosisCode_calc']],
        on=join_columns,
        how='left'
    )

    # Function to check if code is a symptom code (780-789 range)
    def is_symptom_code(code):
        if pd.isna(code) or not code:
            return False
        try:
            return 780 <= int(float(code)) <= 789
        except (TypeError, ValueError, OverflowError):
            # Non-numeric codes cannot be in the numeric symptom range
            return False

    is_symptom = encounter_with_diag['DiagnosisCode_calc'].apply(is_symptom_code).astype(bool)

    # Count symptom codes per patient; patients without any get 0
    symptom_counts = encounter_with_diag.loc[is_symptom].groupby('Patient_ID').size()

    patient_stats = encounter_counts
    patient_stats['symptom_code_count'] = (
        patient_stats['Patient_ID'].map(symptom_counts).fillna(0).astype(int)
    )

    # Calculate ratio of symptom codes to total encounters
    patient_stats['symptom_encounter_ratio'] = (
        patient_stats['symptom_code_count'] / patient_stats['encounter_count']
    )

    return patient_stats

def get_lab_test_patterns(lab_df, start_date, end_date):
    """
    Analyze patterns in lab tests for patients

    A result counts as normal when it is numeric and lies within
    [LowerNormal, UpperNormal]. A result that is not numeric (or when the
    range columns are absent) counts as normal when its text contains the
    whole word 'normal', 'negative' or 'unremarkable' (case-insensitive), so
    'abnormal' is not treated as normal. The input frame is not modified.

    Args:
        lab_df (pandas.DataFrame): The Lab table
        start_date (datetime): Start date for the analysis period
        end_date (datetime): End date for the analysis period

    Returns:
        pandas.DataFrame: Patient-level lab test statistics with columns
        Patient_ID and total_lab_tests, plus normal_lab_tests and
        normal_test_ratio when TestResult_calc exists, and unique_test_types
        when Name_calc exists. Empty DataFrame for an empty input.
    """
    if len(lab_df) == 0:
        return pd.DataFrame()

    # Filter lab tests within date range
    if 'PerformedDate' in lab_df.columns:
        in_window = (
            (lab_df['PerformedDate'] >= start_date) &
            (lab_df['PerformedDate'] <= end_date)
        )
        valid_labs = lab_df[in_window]
    else:
        valid_labs = lab_df

    # Count total lab tests per patient
    lab_stats = valid_labs.groupby('Patient_ID').size().reset_index(name='total_lab_tests')

    if 'TestResult_calc' in valid_labs.columns:
        results = valid_labs['TestResult_calc']

        # Keyword approach; word boundaries keep 'abnormal' from matching
        text_normal = results.astype(str).str.contains(
            r'\b(?:normal|negative|unremarkable)\b', case=False, na=False
        )

        if all(col in valid_labs.columns for col in ['LowerNormal', 'UpperNormal']):
            # Range approach for numeric results; comparisons against a
            # missing bound are False
            result_num = pd.to_numeric(results, errors='coerce')
            lower_num = pd.to_numeric(valid_labs['LowerNormal'], errors='coerce')
            upper_num = pd.to_numeric(valid_labs['UpperNormal'], errors='coerce')
            in_range = (result_num >= lower_num) & (result_num <= upper_num)

            # Row-wise fallback to the text check for non-numeric results
            is_normal = in_range | (result_num.isna() & text_normal)
        else:
            is_normal = text_normal

        # Count normal lab tests per patient (kept as a separate Series so no
        # helper columns are written into the caller's data)
        normal_counts = is_normal.groupby(valid_labs['Patient_ID']).sum()
        lab_stats['normal_lab_tests'] = (
            lab_stats['Patient_ID'].map(normal_counts).fillna(0).astype(int)
        )

        # Calculate ratio of normal to total tests
        lab_stats['normal_test_ratio'] = lab_stats['normal_lab_tests'] / lab_stats['total_lab_tests']
    else:
        print("Warning: TestResult_calc missing in Lab table. Skipping normal-result statistics.")

    # Calculate distinct test types per patient
    if 'Name_calc' in valid_labs.columns:
        test_diversity = valid_labs.groupby('Patient_ID')['Name_calc'].nunique()
        lab_stats['unique_test_types'] = lab_stats['Patient_ID'].map(test_diversity)
    else:
        print("Warning: Name_calc missing in Lab table. Skipping test diversity.")

    return lab_stats

def analyze_referral_patterns(referral_df, start_date, end_date):
    """
    Summarize referral activity per patient

    Args:
        referral_df (pandas.DataFrame): The Referral table
        start_date (datetime): Start date for the analysis period
        end_date (datetime): End date for the analysis period

    Returns:
        pandas.DataFrame: One row per patient with total_referrals (number of
        referral rows) and unique_specialists (distinct Name_calc values).
        Empty DataFrame for an empty input.
    """
    if len(referral_df) == 0:
        return pd.DataFrame()

    # Restrict to referrals completed inside the window when the completion
    # date is available; otherwise use every row
    has_completed_date = 'CompletedDate' in referral_df.columns
    if not has_completed_date:
        valid_referrals = referral_df
    else:
        completed = referral_df['CompletedDate']
        inside_window = (completed >= start_date) & (completed <= end_date)
        valid_referrals = referral_df[inside_window]

    by_patient = valid_referrals.groupby('Patient_ID')

    # Number of referral rows per patient
    totals = by_patient.size().reset_index(name='total_referrals')

    # Number of distinct referral targets per patient
    distinct_names = by_patient['Name_calc'].nunique().reset_index(
        name='unique_specialists'
    )

    return pd.merge(totals, distinct_names, on='Patient_ID', how='left')

def analyze_medication_patterns(medication_df, start_date, end_date):
    """
    Analyze patterns in medications for patients

    A medication is included when its active interval overlaps the analysis
    period: it started on or before end_date (or has no StartDate) and it
    stopped on or after start_date (or has no StopDate, i.e. is treated as
    ongoing). The input frame is not modified.

    Args:
        medication_df (pandas.DataFrame): The Medication table
        start_date (datetime): Start date for the analysis period
        end_date (datetime): End date for the analysis period

    Returns:
        pandas.DataFrame: Patient-level medication statistics with columns
        Patient_ID and total_medications, plus psychotropic_count and
        psychotropic_ratio when Name_calc exists. Empty DataFrame for an
        empty input.
    """
    if len(medication_df) == 0:
        return pd.DataFrame()

    # Filter medications active at some point within the date range
    if 'StartDate' in medication_df.columns:
        started = medication_df['StartDate']
        started_by_end = started.isna() | (started <= end_date)

        if 'StopDate' in medication_df.columns:
            stopped = medication_df['StopDate']
            active_after_start = stopped.isna() | (stopped >= start_date)
        else:
            # No stop information: treat every medication as ongoing
            active_after_start = pd.Series(True, index=medication_df.index)

        valid_medications = medication_df[started_by_end & active_after_start]
    else:
        valid_medications = medication_df

    # Count total medications per patient
    med_stats = valid_medications.groupby('Patient_ID').size().reset_index(name='total_medications')

    if 'Name_calc' not in valid_medications.columns:
        return med_stats

    # Identify psychotropic medications by class or brand name substring
    psychotropic_keywords = [
        'antidepressant', 'ssri', 'snri', 'anxiolytic', 'benzodiazepine',
        'antipsychotic', 'mood stabilizer', 'prozac', 'zoloft', 'paxil',
        'celexa', 'lexapro', 'effexor', 'cymbalta', 'wellbutrin', 'xanax',
        'ativan', 'klonopin', 'valium', 'risperdal', 'zyprexa', 'seroquel',
        'lithium', 'depakote', 'lamictal'
    ]
    text_pattern = '|'.join(psychotropic_keywords)
    is_psychotropic = valid_medications['Name_calc'].astype(str).str.contains(
        text_pattern, case=False, na=False
    )

    # Count psychotropic medications per patient (kept as a separate Series
    # so no helper column is written into the caller's data)
    psychotropic_counts = is_psychotropic.groupby(valid_medications['Patient_ID']).sum()
    med_stats['psychotropic_count'] = (
        med_stats['Patient_ID'].map(psychotropic_counts).fillna(0).astype(int)
    )

    # Calculate ratio of psychotropic to total medications
    med_stats['psychotropic_ratio'] = med_stats['psychotropic_count'] / med_stats['total_medications']

    return med_stats

def analyze_text_fields(df, text_column, keyword_categories=None):
    """
    Analyze text fields for somatic symptom keywords with detailed categorization

    Each row of df is treated as one document. Keywords are matched
    case-insensitively as literal substrings, and every occurrence is
    counted. A category with no keywords contributes zero matches. The input
    frame is not modified.

    Args:
        df (pandas.DataFrame): DataFrame containing Patient_ID and the text column
        text_column (str): Name of the column containing text
        keyword_categories (dict): Dictionary mapping category name to a list
            of keywords. If None, uses the three default categories
            (clinical_documentation, patient_report, nlp_oriented)

    Returns:
        pandas.DataFrame: One row per Patient_ID with total_keyword_count,
        total_documents (non-null texts), per-category keywords_<cat>_count
        and has_<cat>_keywords (number of documents with a match),
        documents_with_keywords, keyword_document_ratio,
        <cat>_document_ratio and somatic_language_density. Empty DataFrame
        when df is empty or text_column is absent.
    """
    if len(df) == 0 or text_column not in df.columns:
        return pd.DataFrame()

    # If no categories provided, use defaults
    if keyword_categories is None:
        keyword_categories = {
            'clinical_documentation': CLINICAL_DOCUMENTATION_KEYWORDS,
            'patient_report': PATIENT_REPORT_KEYWORDS,
            'nlp_oriented': NLP_ORIENTED_KEYWORDS
        }

    # Work on a copy of just the two columns that are needed
    docs = df[['Patient_ID', text_column]].copy()

    # Lowercase once so every category is matched case-insensitively
    lowered_text = docs[text_column].astype(str).str.lower()

    docs['total_keyword_count'] = 0

    # Process each keyword category separately
    for category, keywords in keyword_categories.items():
        category_col = f'keywords_{category}_count'

        # Escape keywords so they match literally; skip empty strings, since
        # an empty alternative would match at every position
        escaped_terms = [re.escape(kw.lower()) for kw in keywords if kw]

        if escaped_terms:
            docs[category_col] = (
                lowered_text.str.count('|'.join(escaped_terms)).fillna(0).astype(int)
            )
        else:
            docs[category_col] = 0

        docs['total_keyword_count'] += docs[category_col]

        # Also flag if any keywords from this category are present
        docs[f'has_{category}_keywords'] = docs[category_col] > 0

    # Row-level flag, aggregated below together with the other columns so it
    # stays aligned with Patient_ID by label rather than by position
    docs['documents_with_keywords'] = docs['total_keyword_count'] > 0

    # Calculate patient-level aggregates
    agg_dict = {
        'total_keyword_count': 'sum',
        text_column: 'count'  # Count total (non-null) documents
    }
    for category in keyword_categories:
        agg_dict[f'keywords_{category}_count'] = 'sum'
        agg_dict[f'has_{category}_keywords'] = 'sum'
    agg_dict['documents_with_keywords'] = 'sum'

    result = docs.groupby('Patient_ID').agg(agg_dict).reset_index()

    # Rename the document count column
    result = result.rename(columns={text_column: 'total_documents'})

    # Proportion of documents with ANY keywords
    result['keyword_document_ratio'] = result['documents_with_keywords'] / result['total_documents']

    # Proportion of documents with keywords from each category
    for category in keyword_categories:
        result[f'{category}_document_ratio'] = result[f'has_{category}_keywords'] / result['total_documents']

    # Average number of keyword occurrences per document
    result['somatic_language_density'] = result['total_keyword_count'] / result['total_documents']

    return result

#################################
# 5. Main Data Loading Function
#################################

def _load_patient_table(table_name, patient_ids):
    """
    Load one prepared table in full and restrict it to the selected patients.

    Args:
        table_name (str): Key passed to load_table and used to look up DATE_COLUMNS
        patient_ids: Collection of Patient_ID values to keep; when it is empty
            the table is returned unfiltered

    Returns:
        DataFrame: The loaded table, filtered on 'Patient_ID' when IDs are given
    """
    table_df = load_table(table_name,
                          date_columns=DATE_COLUMNS[table_name],
                          nrows=None)

    # Filter to our patient sample if applicable
    if len(patient_ids) > 0:
        table_df = table_df[table_df['Patient_ID'].isin(patient_ids)]

    return table_df


def _summarize_procedures(procedure_df):
    """
    Count total and diagnostic procedures per patient.

    Args:
        procedure_df (DataFrame): MedicalProcedure rows; not modified

    Returns:
        DataFrame or None: One row per Patient_ID with total_procedures,
        diagnostic_procedures and diagnostic_ratio, or None when there are no
        rows or no 'Name_calc' column to classify on
    """
    if len(procedure_df) == 0 or 'Name_calc' not in procedure_df.columns:
        return None

    # Regex fragments identifying diagnostic procedures vs treatments.
    # 'ct' is matched as a whole word only: as a bare substring it also hits
    # unrelated names such as "injection", "fracture" or "hysterectomy".
    diagnostic_keywords = [
        'endoscopy', 'biopsy', 'x-ray', 'xray', 'mri', r'\bct\b', 'scan',
        'ultrasound', 'echo', 'ecg', 'ekg', 'imaging', 'diagnostic',
        'examination', 'assessment', 'test', 'evaluation'
    ]
    diagnostic_pattern = '|'.join(diagnostic_keywords)

    # Work on a copy so the flag column is never written into a filtered slice
    procedure_df = procedure_df.copy()
    procedure_df['is_diagnostic'] = procedure_df['Name_calc'].astype(str).str.contains(
        diagnostic_pattern, case=False, na=False
    )

    # Group by patient and count
    procedure_stats = procedure_df.groupby('Patient_ID').agg(
        total_procedures=('MedicalProcedure_ID', 'count'),
        diagnostic_procedures=('is_diagnostic', 'sum')
    ).reset_index()

    # Calculate ratio of diagnostic to total procedures
    procedure_stats['diagnostic_ratio'] = (
        procedure_stats['diagnostic_procedures'] / procedure_stats['total_procedures']
    )

    return procedure_stats


def load_and_preprocess_cpcssn_data(sample_size=None, end_date=None, lookback_days=730):
    """
    Load and preprocess all necessary CPCSSN tables for SSD analysis

    Args:
        sample_size (int): Number of patients to sample (None for all)
        end_date (datetime): End of the analysis period. Defaults to the
            current time, which makes the window depend on when the code runs;
            pass a fixed date for reproducible results.
        lookback_days (int): Length of the analysis period in days
            (default 730, i.e. 2 years)

    Returns:
        dict: Dictionary of DataFrames and stats ready for analysis. Always
        contains 'patients' and 'analysis_period'. When patient data was
        loaded it also contains 'mental_health', 'encounter_stats',
        'lab_stats', 'referral_stats' and 'medication_stats', plus
        'reason_text_analysis', 'diagnosis_text_analysis',
        'condition_text_analysis' and 'procedure_stats' when the columns they
        rely on are present.
    """
    print("\nLoading and preprocessing CPCSSN data...")

    # Load patient demographics first
    patient_df = load_table('PatientDemographic_merged',
                            date_columns=DATE_COLUMNS['PatientDemographic_merged'],
                            nrows=sample_size)

    # Basic preprocessing of patient data
    patient_df = preprocess_patient_demographics(patient_df)

    # If we want a sample, filter to those patient IDs only.
    # NOTE(review): nrows=sample_size is already passed to load_table above, so
    # if load_table honours it this draw keeps every remaining row - confirm
    # whether a random sample of the full table was intended.
    if sample_size and len(patient_df) > 0:
        patient_sample = patient_df.sample(min(sample_size, len(patient_df)), random_state=42)
        patient_ids = patient_sample['Patient_ID'].unique()
        print(f"Sampled {len(patient_ids)} patients for analysis")
    else:
        patient_ids = patient_df['Patient_ID'].unique() if len(patient_df) > 0 else []

    # Define analysis period (by default the last 2 years up to now)
    if end_date is None:
        end_date = datetime.now()
    start_date = end_date - timedelta(days=lookback_days)

    print(f"Analysis period: {start_date.date()} to {end_date.date()}")

    # Initialize results dictionary
    results = {
        'patients': patient_df,
        'analysis_period': {
            'start_date': start_date,
            'end_date': end_date
        }
    }

    # Without patient data there is nothing to join the other tables to
    if len(patient_df) == 0:
        return results

    # Load HealthCondition table and identify mental health conditions
    # with specific focus on SSD-related codes
    health_condition_df = _load_patient_table('HealthCondition', patient_ids)
    results['mental_health'] = identify_mental_health_conditions(health_condition_df)

    # Load Encounter and EncounterDiagnosis tables
    encounter_df = _load_patient_table('Encounter', patient_ids)
    encounter_diag_df = _load_patient_table('EncounterDiagnosis', patient_ids)

    # Calculate encounter statistics
    results['encounter_stats'] = calculate_patient_encounter_stats(
        encounter_df, encounter_diag_df, start_date, end_date
    )

    # Analyze text fields for SSD-related terminology
    # First check chief complaint in Encounter table
    if 'Reason_orig' in encounter_df.columns:
        results['reason_text_analysis'] = analyze_text_fields(encounter_df, 'Reason_orig')

    # Also analyze DiagnosisText fields in EncounterDiagnosis for more context
    if 'DiagnosisText_orig' in encounter_diag_df.columns:
        results['diagnosis_text_analysis'] = analyze_text_fields(
            encounter_diag_df, 'DiagnosisText_orig'
        )

    # And check DiagnosisText in HealthCondition for chronic conditions
    if 'DiagnosisText_orig' in health_condition_df.columns:
        results['condition_text_analysis'] = analyze_text_fields(
            health_condition_df, 'DiagnosisText_orig'
        )

    # Load and analyze Lab tests
    lab_df = _load_patient_table('Lab', patient_ids)
    results['lab_stats'] = get_lab_test_patterns(lab_df, start_date, end_date)

    # Load and analyze Referrals
    referral_df = _load_patient_table('Referral', patient_ids)
    results['referral_stats'] = analyze_referral_patterns(referral_df, start_date, end_date)

    # Load and analyze Medications
    medication_df = _load_patient_table('Medication', patient_ids)
    results['medication_stats'] = analyze_medication_patterns(medication_df, start_date, end_date)

    # Load MedicalProcedure to check for multiple diagnostic procedures
    procedure_df = _load_patient_table('MedicalProcedure', patient_ids)

    # Analyze procedures - count total procedures and diagnostic procedures per patient
    procedure_stats = _summarize_procedures(procedure_df)
    if procedure_stats is not None:
        results['procedure_stats'] = procedure_stats

    return results

# Example usage: Load a small sample for development
def _print_basic_statistics(data):
    """Print a short per-table summary of the dictionary returned by the loader."""

    def _has_rows(key):
        # A table is reported only when it was produced and is non-empty
        return key in data and len(data[key]) > 0

    print("\nBasic statistics:")

    if _has_rows('patients'):
        patients = data['patients']
        print(f"Total patients: {len(patients)}")
        print(f"Age distribution: {patients['Age'].describe()}")
        print(f"Gender distribution: {patients['Sex'].value_counts()}")

    if _has_rows('encounter_stats'):
        encounters = data['encounter_stats']
        print("\nEncounter statistics:")
        print(f"Average encounters per patient: {encounters['encounter_count'].mean():.2f}")
        # Symptom-code counts are optional in the encounter statistics
        if 'symptom_code_count' in encounters.columns:
            print(f"Average symptom codes per patient: {encounters['symptom_code_count'].mean():.2f}")

    if _has_rows('lab_stats'):
        labs = data['lab_stats']
        print("\nLab test statistics:")
        print(f"Average lab tests per patient: {labs['total_lab_tests'].mean():.2f}")
        # The normal-result ratio is optional in the lab statistics
        if 'normal_test_ratio' in labs.columns:
            print(f"Average normal test ratio: {labs['normal_test_ratio'].mean():.2f}")


if __name__ == "__main__":
    # Development run on a small sample; set to None for all patients
    sample_size = 1000

    # Load and preprocess data, then report on it
    data = load_and_preprocess_cpcssn_data(sample_size=sample_size)
    _print_basic_statistics(data)

    print("\nData preprocessing complete.")

Starting SSD identification pipeline...
GPU available: NVIDIA RTX A1000 6GB Laptop GPU

Loading and preprocessing CPCSSN data...
Loading PatientDemographic_merged from prepared_data/PatientDemographic_merged_prepared.csv...
Successfully loaded 1000 rows from PatientDemographic_merged
Sampled 995 patients for analysis
Analysis period: 2023-02-26 to 2025-02-25
Loading HealthCondition from prepared_data/HealthCondition_prepared.csv...
Successfully loaded 2571583 rows from HealthCondition
Loading Encounter from prepared_data/Encounter_prepared.csv...
Successfully loaded 11577739 rows from Encounter
Loading EncounterDiagnosis from prepared_data/EncounterDiagnosis_prepared.csv...
Successfully loaded 12471764 rows from EncounterDiagnosis
Encounter DataFrame columns: ['Encounter_ID', 'Network_ID', 'Site_ID', 'Patient_ID', 'Provider_ID', 'Cycle_ID', 'EncounterDate', 'Reason_orig', 'Reason_calc', 'EncounterType', 'DateCreated']
EncounterDiagnosis DataFrame columns: ['EncounterDiagnosis_ID', 'Net

## Part 2: Feature Engineering and Analysis

I'll start by creating a master feature engineering function that integrates all the patient data into a unified dataset with SSD-relevant features.

In [5]:
!pip install shap

Collecting shap
  Downloading shap-0.46.0-cp312-cp312-win_amd64.whl.metadata (25 kB)
Collecting slicer==0.0.8 (from shap)
  Downloading slicer-0.0.8-py3-none-any.whl.metadata (4.0 kB)
Downloading shap-0.46.0-cp312-cp312-win_amd64.whl (456 kB)
Downloading slicer-0.0.8-py3-none-any.whl (15 kB)
Installing collected packages: slicer, shap
Successfully installed shap-0.46.0 slicer-0.0.8




In [11]:
# AI-Driven Identification and Management of Somatic Symptom Disorder in Primary Care
# Part 2: Feature Engineering and Data Integration

# Markdown sections that open Part 2, listed in the order they are rendered
part2_intro_sections = (
    "# Part 2: Feature Engineering and Data Integration",
    """
## Research Questions and Hypothesis

**Primary Research Question:**  
Can we identify patients with probable Somatic Symptom Disorder (SSD) using EMR data from Canadian primary care practices?

**Secondary Questions:**  
1. What EMR features most strongly correlate with SSD patterns?
2. How does healthcare utilization differ between patients with high vs. low somatic symptom burden?
3. Can we identify clusters or subtypes within the SSD phenotype?

**Hypothesis:**  
We hypothesize that patients with SSD patterns will show:
1. Higher frequency of primary care visits
2. Multiple unexplained symptoms coded across body systems
3. Higher proportion of normal lab results
4. More frequent referrals to specialists
5. Presence of mental health comorbidities (especially anxiety)
6. Evidence of somatic language in clinical notes
""",
    """
## Feature Engineering Plan

We will transform the preprocessed data into a unified patient-level feature set for analysis.
Our engineering approach includes:

1. **Basic Utilization Features**: Visit counts, visit frequency, lab test counts
2. **SSD-Specific Features**: SSD codes, unexplained symptom patterns, normal test ratios
3. **Text-Derived Features**: Somatic language markers in clinical notes 
4. **Temporal Pattern Features**: Persistence of symptoms, changing patterns over time
5. **Interaction Features**: Combinations of symptoms, mental health, and utilization
""",
)

for section_md in part2_intro_sections:
    display(Markdown(section_md))

# Empty placeholder for the brief summary of the data we have so far
summary_columns = ['Table', 'Rows', 'Patients', 'Date Range']
tables_summary = pd.DataFrame(columns=summary_columns)

# Part 2: Feature Engineering and Data Integration


## Research Questions and Hypothesis

**Primary Research Question:**  
Can we identify patients with probable Somatic Symptom Disorder (SSD) using EMR data from Canadian primary care practices?

**Secondary Questions:**  
1. What EMR features most strongly correlate with SSD patterns?
2. How does healthcare utilization differ between patients with high vs. low somatic symptom burden?
3. Can we identify clusters or subtypes within the SSD phenotype?

**Hypothesis:**  
We hypothesize that patients with SSD patterns will show:
1. Higher frequency of primary care visits
2. Multiple unexplained symptoms coded across body systems
3. Higher proportion of normal lab results
4. More frequent referrals to specialists
5. Presence of mental health comorbidities (especially anxiety)
6. Evidence of somatic language in clinical notes



## Feature Engineering Plan

We will transform the preprocessed data into a unified patient-level feature set for analysis.
Our engineering approach includes:

1. **Basic Utilization Features**: Visit counts, visit frequency, lab test counts
2. **SSD-Specific Features**: SSD codes, unexplained symptom patterns, normal test ratios
3. **Text-Derived Features**: Somatic language markers in clinical notes 
4. **Temporal Pattern Features**: Persistence of symptoms, changing patterns over time
5. **Interaction Features**: Combinations of symptoms, mental health, and utilization


In [3]:
# First, we need to load our preprocessed data
# Use the function from Part 1. The Part 1 cell that defines
# load_and_preprocess_cpcssn_data must have been run in this kernel first;
# otherwise this line fails with a NameError.
data = load_and_preprocess_cpcssn_data(sample_size=1000)  # Using same sample size as in Part 1

# Summarize the data we've processed: one dict per available table, turned into
# a DataFrame at the end. Tables that are missing or empty are simply skipped.
tables_summary = []

if 'patients' in data and len(data['patients']) > 0:
    patient_count = len(data['patients'])
    tables_summary.append({
        'Table': 'Patient Demographics',
        'Rows': patient_count,
        'Patients': patient_count,
        'Key Metrics': f"Mean Age: {data['patients']['Age'].mean():.1f}, Female: {(data['patients']['Sex'] == 'FEMALE').mean():.1%}"
    })

if 'encounter_stats' in data and len(data['encounter_stats']) > 0:
    ec_df = data['encounter_stats']
    tables_summary.append({
        'Table': 'Encounters',
        'Rows': '-',
        'Patients': len(ec_df),
        # symptom_code_count is optional; fall back to a single zero so the mean is 0.0
        'Key Metrics': f"Avg encounters/patient: {ec_df['encounter_count'].mean():.1f}, Symptom codes: {ec_df.get('symptom_code_count', pd.Series([0])).mean():.1f}"
    })

if 'lab_stats' in data and len(data['lab_stats']) > 0:
    lab_df = data['lab_stats']
    lab_metrics = f"Avg tests/patient: {lab_df['total_lab_tests'].mean():.1f}"
    # normal_test_ratio is treated as optional by the Part 1 report, so guard it
    # here as well instead of raising a KeyError when it is absent
    if 'normal_test_ratio' in lab_df.columns:
        lab_metrics += f", Normal ratio: {lab_df['normal_test_ratio'].mean():.2f}"
    tables_summary.append({
        'Table': 'Lab Tests',
        'Rows': '-',
        'Patients': len(lab_df),
        'Key Metrics': lab_metrics
    })

if 'referral_stats' in data and len(data['referral_stats']) > 0:
    ref_df = data['referral_stats']
    tables_summary.append({
        'Table': 'Referrals',
        'Rows': '-',
        'Patients': len(ref_df),
        'Key Metrics': f"Avg referrals/patient: {ref_df['total_referrals'].mean():.1f}"
    })

if 'mental_health' in data and len(data['mental_health']) > 0:
    mh_df = data['mental_health']
    tables_summary.append({
        'Table': 'Mental Health',
        'Rows': '-',
        # NOTE(review): assumes mental_health has one row per patient - confirm
        'Patients': len(mh_df),
        'Key Metrics': f"With MH condition: {mh_df['has_mental_health_condition'].mean():.1%}, SSD codes: {mh_df['has_ssd_related_code'].mean():.1%}"
    })

# Also check for procedure and text analysis data
if 'procedure_stats' in data and len(data['procedure_stats']) > 0:
    proc_df = data['procedure_stats']
    tables_summary.append({
        'Table': 'Medical Procedures',
        'Rows': '-',
        'Patients': len(proc_df),
        'Key Metrics': f"Diagnostic procedures/patient: {proc_df['diagnostic_procedures'].mean():.1f}"
    })

if 'reason_text_analysis' in data and len(data['reason_text_analysis']) > 0:
    text_df = data['reason_text_analysis']
    tables_summary.append({
        'Table': 'Text Analysis (Reason)',
        'Rows': '-',
        'Patients': len(text_df),
        'Key Metrics': f"Somatic language density: {text_df['somatic_language_density'].mean():.3f}"
    })

tables_df = pd.DataFrame(tables_summary)
display(Markdown("### Data Summary:"))
display(tables_df)

# Begin feature engineering by creating a master patient feature dataframe
display(Markdown("## Feature Engineering"))
display(Markdown("### 1. Creating Patient-Level Feature Matrix"))

# Demographics are the base table; later feature groups are left-joined onto it
if 'patients' not in data or len(data['patients']) == 0:
    print("Error: No patient data available to create feature matrix")
    patient_features = pd.DataFrame(columns=['Patient_ID'])
else:
    patient_features = data['patients'].loc[:, ['Patient_ID', 'Age', 'Sex']].copy()

    # Binary sex indicator for modeling (will be revised if other genders present)
    patient_features['is_female'] = patient_features['Sex'].eq('FEMALE').astype(int)

    # Display progress
    print(f"Created base feature matrix with {len(patient_features)} patients")
    print(f"Starting features: {list(patient_features.columns)}")

# Add encounter features if available
if 'encounter_stats' in data and len(data['encounter_stats']) > 0:
    enc_df = data['encounter_stats']

    # encounter_count is always taken; the symptom-code columns only when present
    enc_features = ['encounter_count'] + [
        optional_col
        for optional_col in ('symptom_code_count', 'symptom_encounter_ratio')
        if optional_col in enc_df.columns
    ]

    # Left join keeps every patient, including those without encounter rows
    patient_features = patient_features.merge(
        enc_df[['Patient_ID'] + enc_features],
        on='Patient_ID',
        how='left',
    )

    # Patients with no encounters come out of the join as NaN; record them as 0
    patient_features[enc_features] = patient_features[enc_features].fillna(0)

    print(f"Added encounter features: {enc_features}")

# Display first few rows of our evolving feature matrix
display(Markdown("### Current Feature Matrix Preview:"))
display(patient_features.head())

NameError: name 'load_and_preprocess_cpcssn_data' is not defined