# Experiment 1: Causal Effect of Negative Lab Tests on Healthcare Utilization Mediated by SSD Severity

In [6]:
# 00_setup_and_protocol.ipynb
"""
SSD Causal Analysis Protocol
Version: 1.0
Date: February 24, 2025
"""


'\nSSD Causal Analysis Protocol\nVersion: 1.0\nDate: February 24, 2025\n'

In [7]:
#%% [markdown]
# # 1. Environment Setup and Validation

#%%
# First, let's check and install required packages
import sys
import subprocess

def ensure_dependencies():
    """Install required packages if not present"""
    required_packages = {
        'pandas': 'pandas',
        'numpy': 'numpy',
        'matplotlib': 'matplotlib',
        'seaborn': 'seaborn',
        'pyyaml': 'pyyaml'
    }
    
    for package, pip_name in required_packages.items():
        try:
            __import__(package)
        except ImportError:
            print(f"Installing {package}...")
            subprocess.check_call([sys.executable, '-m', 'pip', 'install', pip_name])
            print(f"Done")
        


ensure_dependencies()

Installing pyyaml...
Done


In [8]:

#%%
# Core imports
import pandas as pd
import numpy as np
from pathlib import Path
import yaml
import logging
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display, Markdown, HTML
from typing import Dict, List, Tuple

# Set up basic configuration
def setup_environment():
    """Configure the notebook environment"""
    # Set random seed
    np.random.seed(42)
    
    # Configure pandas display
    pd.set_option('display.max_columns', None)
    pd.set_option('display.max_rows', 100)
    
    # Configure matplotlib
    plt.style.use('default')  # Using default style instead of seaborn
    
    # Set up logging
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s'
    )
    
    return logging.getLogger(__name__)

logger = setup_environment()
%matplotlib inline


In [9]:

#%%
# Project structure setup
def setup_project_structure():
    """Create necessary project directories"""
    directories = [
        '../data',
        '../results',
        '../figures',
        '../logs'
    ]
    
    for directory in directories:
        Path(directory).mkdir(parents=True, exist_ok=True)
        logger.info(f"Created directory: {directory}")

setup_project_structure()

#%%
# Load and validate configuration
def load_config() -> Dict:
    """
    Load study configuration
    
    Returns:
        Dict: Configuration settings
    """
    # Default configuration if file doesn't exist
    default_config = {
        'data_paths': {
            'raw_data': r'C:\Users\ProjectC4M\Documents\CPCSSN Datasets Care4Mind\New Extraction Feb 2025\extracted_data',
            'prepared_data': '../100k_sample'
        },
        'study_parameters': {
            'baseline_period': '2018-01-01',
            'follow_up_period': '2020-12-31',
            'min_age': 18,
            'max_age': 85
        }
    }
    
    config_path = Path('../config/config.yaml')
    
    try:
        if config_path.exists():
            with open(config_path) as f:
                config = yaml.safe_load(f)
            logger.info("Loaded configuration from file")
        else:
            config = default_config
            with open(config_path, 'w') as f:
                yaml.dump(config, f)
            logger.info("Created default configuration file")
            
        return config
        
    except Exception as e:
        logger.error(f"Configuration error: {str(e)}")
        return default_config

config = load_config()


2025-02-24 02:44:01,041 - INFO - Created directory: ../data
2025-02-24 02:44:01,041 - INFO - Created directory: ../results
2025-02-24 02:44:01,041 - INFO - Created directory: ../figures
2025-02-24 02:44:01,041 - INFO - Created directory: ../logs
2025-02-24 02:44:01,074 - INFO - Loaded configuration from file


In [10]:
#%% [markdown]
# # 2. Study Protocol Documentation
#%%
# Study Protocol Documentation
protocol = {
    "title": "Causal Effect of Negative Lab Tests on Healthcare Utilization",
    "version": "1.0",
    "date": "2025-02-24",
    "study_id": "SSD-CPCSSN-2025-001",
    
    "temporal_windows": {
        "baseline": "2018-01-01 to 2018-06-30",
        "treatment": "2018-07-01 to 2019-06-30",
        "outcome": "2019-07-01 to 2020-06-30"
    },
    
    "variables": {
        "treatment": {
            "name": "Negative Lab Cascade",
            "definition": "≥3 normal lab results in 12 months",
            "type": "binary",
            "source": "Lab_prepared.csv"
        },
        "mediator": {
            "name": "SSD Severity Score",
            "components": [
                "Symptom code frequency (ICD-9: 780-789)",
                "Visit patterns for unexplained symptoms",
                "Anxiety/depression indicators"
            ],
            "type": "continuous (0-100)"
        },
        "outcome": {
            "name": "Healthcare Utilization",
            "metrics": [
                "Total encounters",
                "ED visits",
                "Specialist referrals"
            ],
            "type": "count"
        }
    }
}

#%%
# Display protocol in formatted markdown
def display_protocol(protocol: Dict):
    """
    Create formatted display of study protocol
    """
    md_text = f"""
    # {protocol['title']}
    
    **Version:** {protocol['version']}  
    **Date:** {protocol['date']}  
    **Study ID:** {protocol['study_id']}
    
    ## Time Windows
    """
    
    for period, window in protocol['temporal_windows'].items():
        md_text += f"\n**{period.title()}:** {window}"
    
    md_text += "\n\n## Variables\n"
    
    for var_type, details in protocol['variables'].items():
        md_text += f"\n### {var_type.title()}: {details['name']}\n"
        md_text += f"- **Type:** {details['type']}\n"
        
        if 'components' in details:
            md_text += "- **Components:**\n"
            for comp in details['components']:
                md_text += f"  * {comp}\n"
        
        if 'metrics' in details:
            md_text += "- **Metrics:**\n"
            for metric in details['metrics']:
                md_text += f"  * {metric}\n"

    display(Markdown(md_text))

display_protocol(protocol)

#%% [markdown]
# # 3. Data Validation Check

#%%
def check_required_files():
    """
    Verify existence and basic properties of required files
    """
    required_files = [
        'PatientDemographic_merged_prepared.csv',
        'Lab_prepared.csv',
        'Encounter_prepared.csv',
        'EncounterDiagnosis_prepared.csv'
    ]
    
    results = {}
    data_path = Path(config['data_paths']['prepared_data'])

    # First check if the directory exists
    if not data_path.exists():
        print(f"Directory does not exist: {data_path}")
        return results
        
    print(f"Checking files in: {data_path}")
    
    for file in required_files:
        try:
            file_path = data_path / file
            print(f"Looking for file: {file_path}")
            
            if file_path.exists():
                # Read first few rows to check structure
                df = pd.read_csv(file_path, nrows=5)
                
                results[file] = {
                    'status': 'Available',
                    'size_mb': round(file_path.stat().st_size / (1024 * 1024), 2),
                    'columns': list(df.columns),
                    'last_modified': datetime.fromtimestamp(
                        file_path.stat().st_mtime
                    ).strftime('%Y-%m-%d %H:%M:%S')
                }
            else:
                results[file] = {'status': 'Missing'}
                
        except Exception as e:
            results[file] = {'status': 'Error', 'message': str(e)}
    
    return results

#%%
# Run data validation check
validation_results = check_required_files()

# Display results in a formatted table
def display_validation_results(results: Dict):
    """Create formatted display of validation results"""
    md_text = "## Data Validation Results\n\n"
    
    for file, details in results.items():
        md_text += f"### {file}\n"
        md_text += f"**Status:** {details['status']}\n\n"
        
        if details['status'] == 'Available':
            md_text += f"- Size: {details['size_mb']} MB\n"
            md_text += f"- Last Modified: {details['last_modified']}\n"
            md_text += "- Columns:\n"
            for col in details['columns']:
                md_text += f"  * {col}\n"
        elif details['status'] == 'Error':
            md_text += f"Error: {details['message']}\n"
            
        md_text += "\n---\n"
    
    display(Markdown(md_text))

display_validation_results(validation_results)

#%% [markdown]
# # 4. Power Analysis

#%%
def calculate_sample_size():
    """
    Calculate required sample size for causal mediation analysis
    Based on Fritz & MacKinnon (2007)
    """
    alpha = 0.05  # Type I error rate
    power = 0.80  # Desired power
    effect_size = 0.20  # Expected effect size
    
    # Sample size calculation for mediation
    from scipy.stats import norm
    
    z_alpha = norm.ppf(1 - alpha/2)
    z_beta = norm.ppf(power)
    
    n_raw = ((z_alpha + z_beta)**2) / effect_size**2
    n_adjusted = int(n_raw * 1.2)  # Adding 20% for attrition
    
    results = {
        'required_n': n_adjusted,
        'parameters': {
            'alpha': alpha,
            'power': power,
            'effect_size': effect_size,
            'attrition_adjustment': '20%'
        }
    }
    
    return results

#%%
# Run power analysis
power_results = calculate_sample_size()

# Display results
power_md = f"""
## Power Analysis Results

Required sample size: **{power_results['required_n']}** patients

### Parameters:
- Alpha (Type I error rate): {power_results['parameters']['alpha']}
- Power: {power_results['parameters']['power']}
- Expected effect size: {power_results['parameters']['effect_size']}
- Attrition adjustment: {power_results['parameters']['attrition_adjustment']}
"""

display(Markdown(power_md))

#%% [markdown]
# # 5. Save Protocol and Setup Log

#%%
def save_study_documentation():
    """
    Save all study documentation and setup information
    """
    import matplotlib
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    
    # Create documentation dictionary
    documentation = {
        'protocol': protocol,
        'power_analysis': power_results,
        'data_validation': validation_results,
        'environment': {
            'python_version': sys.version,
            'key_packages': {
                'pandas': pd.__version__,
                'numpy': np.__version__,
                'matplotlib': matplotlib.__version__
            },
            'execution_date': timestamp
        }
    }
    
    # Save as YAML
    doc_path = Path('../results') / f'study_documentation_{timestamp}.yaml'
    with open(doc_path, 'w') as f:
        yaml.dump(documentation, f)
    
    logger.info(f"Saved study documentation to {doc_path}")
    
    return doc_path

doc_path = save_study_documentation()

#%% [markdown]
# # 6. Next Steps

#%%
next_steps = """
## Next Steps

1. Proceed to data preprocessing (`01_data_validation.ipynb`):
   - Apply inclusion/exclusion criteria
   - Handle missing data
   - Create analysis dataset

2. Feature engineering (`02_feature_engineering.ipynb`):
   - Construct negative lab cascade indicator
   - Build SSD severity score
   - Calculate healthcare utilization metrics

3. Causal analysis (`03_causal_analysis.ipynb`):
   - Implement DoWhy mediation analysis
   - Conduct sensitivity analyses
   - Generate results

Remember to commit this notebook and documentation to version control before proceeding.
"""

display(Markdown(next_steps))


    # Causal Effect of Negative Lab Tests on Healthcare Utilization
    
    **Version:** 1.0  
    **Date:** 2025-02-24  
    **Study ID:** SSD-CPCSSN-2025-001
    
    ## Time Windows
    
**Baseline:** 2018-01-01 to 2018-06-30
**Treatment:** 2018-07-01 to 2019-06-30
**Outcome:** 2019-07-01 to 2020-06-30

## Variables

### Treatment: Negative Lab Cascade
- **Type:** binary

### Mediator: SSD Severity Score
- **Type:** continuous (0-100)
- **Components:**
  * Symptom code frequency (ICD-9: 780-789)
  * Visit patterns for unexplained symptoms
  * Anxiety/depression indicators

### Outcome: Healthcare Utilization
- **Type:** count
- **Metrics:**
  * Total encounters
  * ED visits
  * Specialist referrals


Checking files in: C:\Users\ProjectC4M\Documents\CPCSSN Datasets Care4Mind\New Extraction Feb 2025\SSD_Experiment1_Causal_Effect\100k_sample
Looking for file: C:\Users\ProjectC4M\Documents\CPCSSN Datasets Care4Mind\New Extraction Feb 2025\SSD_Experiment1_Causal_Effect\100k_sample\PatientDemographic_merged_prepared.csv
Looking for file: C:\Users\ProjectC4M\Documents\CPCSSN Datasets Care4Mind\New Extraction Feb 2025\SSD_Experiment1_Causal_Effect\100k_sample\Lab_prepared.csv
Looking for file: C:\Users\ProjectC4M\Documents\CPCSSN Datasets Care4Mind\New Extraction Feb 2025\SSD_Experiment1_Causal_Effect\100k_sample\Encounter_prepared.csv
Looking for file: C:\Users\ProjectC4M\Documents\CPCSSN Datasets Care4Mind\New Extraction Feb 2025\SSD_Experiment1_Causal_Effect\100k_sample\EncounterDiagnosis_prepared.csv


## Data Validation Results

### PatientDemographic_merged_prepared.csv
**Status:** Available

- Size: 11.35 MB
- Last Modified: 2025-02-20 19:36:06
- Columns:
  * PatientDemographic_ID
  * Network_ID
  * Site_ID
  * Patient_ID
  * Cycle_ID
  * Occupation
  * HighestEducation
  * HousingStatus
  * ResidencePostalCode
  * PatientStatus_orig
  * PatientStatus_calc
  * Language
  * Ethnicity
  * DeceasedYear
  * DateCreated
  * BirthYear
  * BirthMonth
  * OptedOut
  * OptOutDate
  * Sex

---
### Lab_prepared.csv
**Status:** Available

- Size: 0.16 MB
- Last Modified: 2025-02-20 18:43:49
- Columns:
  * Lab_ID
  * Network_ID
  * Site_ID
  * Patient_ID
  * Encounter_ID
  * Cycle_ID
  * PerformedDate
  * Name_orig
  * Name_calc
  * CodeType_orig
  * CodeType_calc
  * Code_orig
  * Code_calc
  * TestResult_orig
  * TestResult_calc
  * UpperNormal
  * LowerNormal
  * NormalRange
  * UnitOfMeasure_orig
  * UnitOfMeasure_calc
  * DateCreated

---
### Encounter_prepared.csv
**Status:** Available

- Size: 390.9 MB
- Last Modified: 2025-02-20 18:43:33
- Columns:
  * Encounter_ID
  * Network_ID
  * Site_ID
  * Patient_ID
  * Provider_ID
  * Cycle_ID
  * EncounterDate
  * Reason_orig
  * Reason_calc
  * EncounterType
  * DateCreated

---
### EncounterDiagnosis_prepared.csv
**Status:** Available

- Size: 504.88 MB
- Last Modified: 2025-02-20 18:43:45
- Columns:
  * EncounterDiagnosis_ID
  * Network_ID
  * Site_ID
  * Patient_ID
  * Encounter_ID
  * Cycle_ID
  * DiagnosisText_orig
  * DiagnosisText_calc
  * DiagnosisCodeType_orig
  * DiagnosisCodeType_calc
  * DiagnosisCode_orig
  * DiagnosisCode_calc
  * DateCreated

---



## Power Analysis Results

Required sample size: **235** patients

### Parameters:
- Alpha (Type I error rate): 0.05
- Power: 0.8
- Expected effect size: 0.2
- Attrition adjustment: 20%


2025-02-24 02:44:01,131 - INFO - Saved study documentation to ..\results\study_documentation_20250224_024401.yaml



## Next Steps

1. Proceed to data preprocessing (`01_data_validation.ipynb`):
   - Apply inclusion/exclusion criteria
   - Handle missing data
   - Create analysis dataset

2. Feature engineering (`02_feature_engineering.ipynb`):
   - Construct negative lab cascade indicator
   - Build SSD severity score
   - Calculate healthcare utilization metrics

3. Causal analysis (`03_causal_analysis.ipynb`):
   - Implement DoWhy mediation analysis
   - Conduct sensitivity analyses
   - Generate results

Remember to commit this notebook and documentation to version control before proceeding.
