# CPCSSN Mental Health Data Exploration

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import yaml
import time

# Import the local pipeline
# from data_preparation import DataPreparationPipeline

In [2]:
# Display settings
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', '{:.2f}'.format)
sns.set_theme(style="whitegrid")

In [5]:
# 1) Check directories
for folder in ['logs', 'prepared_data', 'extracted_data']:
    Path(folder).mkdir(exist_ok=True)

# 2) Load config
config_path = Path('config.yaml')
assert config_path.exists(), "config.yaml not found!"

with open(config_path, 'r', encoding='utf-8') as f:
    config = yaml.safe_load(f)

print("Configuration loaded:")
print(config)

# 3) Initialize pipeline
pipeline = DataPreparationPipeline(
    config_path=str(config_path),
    data_dir=config['data_paths']['raw_data']
)
print("Pipeline initialized.")
print(f"use_gpu = {pipeline.use_gpu}")
print(f"chunk_size = {pipeline.chunk_size}")

# 4) Attempt to load data
table_names = list(config['tables'].keys())
tables = {}
loading_log = []

start_all = time.time()
print("\n=== Loading Tables ===")
for tname in table_names:
    print(f"\nLoading table: {tname}")
    start_t = time.time()
    try:
        df = pipeline.load_table(tname)
        tables[tname] = df
        load_time = time.time() - start_t
        loading_log.append({
            "table": tname,
            "rows": len(df),
            "columns": len(df.columns),
            "time_sec": round(load_time,2)
        })
        print(f"  -> Done: {len(df)} rows, {len(df.columns)} cols in {load_time:.2f}s")
    except Exception as e:
        print(f"  !! Error: {e}")
end_all = time.time()
print(f"\nTotal load time: {round(end_all - start_all,2)}s")

# 5) Summarize loading
load_report = pd.DataFrame(loading_log)
print("\n=== Loading Summary ===")
print(load_report)

Configuration loaded:
{'data_paths': {'raw_data': 'extracted_data/', 'prepared_data': 'prepared_data/', 'file_extension': '.csv'}, 'use_gpu': False, 'chunk_size': 500000, 'tables': {'Patient': {'filename': 'C4MPatient', 'required_columns': ['Patient_ID', 'Sex', 'BirthYear', 'BirthMonth', 'OptedOut', 'OptOutDate'], 'date_columns': ['OptOutDate']}, 'PatientDemographic': {'filename': 'C4MPatientDemographic', 'required_columns': ['PatientDemographic_ID', 'Patient_ID', 'Network_ID', 'Site_ID', 'Cycle_ID', 'Occupation', 'HousingStatus', 'DateCreated'], 'date_columns': ['DateCreated']}, 'Encounter': {'filename': 'C4MEncounter', 'required_columns': ['Encounter_ID', 'Patient_ID', 'Network_ID', 'Site_ID', 'Provider_ID', 'EncounterDate', 'Reason_orig', 'Reason_calc'], 'date_columns': ['EncounterDate', 'DateCreated']}, 'EncounterDiagnosis': {'filename': 'C4MEncounterDiagnosis', 'required_columns': ['EncounterDiagnosis_ID', 'Network_ID', 'Site_ID', 'Patient_ID', 'Encounter_ID', 'Cycle_ID', 'Diagnos

2025-02-13 00:57:00,975 - INFO - Loading table: Patient from extracted_data\C4MPatient.csv
2025-02-13 00:57:00,975 - INFO - Reading extracted_data\C4MPatient.csv in chunks of size=500000



=== Loading Tables ===

Loading table: Patient


2025-02-13 00:57:02,985 - INFO - Successfully loaded Patient: 352161 rows
2025-02-13 00:57:03,083 - INFO - Loading table: PatientDemographic from extracted_data\C4MPatientDemographic.csv
2025-02-13 00:57:03,083 - INFO - Reading extracted_data\C4MPatientDemographic.csv in chunks of size=500000


  -> Done: 352161 rows, 6 cols in 2.11s

Loading table: PatientDemographic


2025-02-13 00:57:06,022 - INFO - Successfully loaded PatientDemographic: 352220 rows
2025-02-13 00:57:06,022 - INFO - Loading table: Encounter from extracted_data\C4MEncounter.csv
2025-02-13 00:57:06,022 - INFO - Reading extracted_data\C4MEncounter.csv in chunks of size=500000


  -> Done: 352220 rows, 15 cols in 2.94s

Loading table: Encounter


2025-02-13 00:58:43,041 - INFO - Successfully loaded Encounter: 11577739 rows
2025-02-13 00:58:43,042 - INFO - Loading table: EncounterDiagnosis from extracted_data\C4MEncounterDiagnosis.csv
2025-02-13 00:58:43,042 - INFO - Reading extracted_data\C4MEncounterDiagnosis.csv in chunks of size=500000


  -> Done: 11577739 rows, 11 cols in 97.02s

Loading table: EncounterDiagnosis


2025-02-13 00:58:46,009 - ERROR - Error loading EncounterDiagnosis: '|' expected after '"'
2025-02-13 00:58:46,094 - INFO - Loading table: FamilyHistory from extracted_data\C4MFamilyHistory.csv
2025-02-13 00:58:46,095 - INFO - Reading extracted_data\C4MFamilyHistory.csv in chunks of size=500000


  !! Error: '|' expected after '"'

Loading table: FamilyHistory


2025-02-13 00:58:49,507 - INFO - Successfully loaded FamilyHistory: 325202 rows
2025-02-13 00:58:49,508 - INFO - Loading table: HealthCondition from extracted_data\C4MHealthCondition.csv
2025-02-13 00:58:49,509 - INFO - Reading extracted_data\C4MHealthCondition.csv in chunks of size=500000


  -> Done: 325202 rows, 20 cols in 3.41s

Loading table: HealthCondition


2025-02-13 00:58:50,173 - ERROR - Error loading HealthCondition: '|' expected after '"'
2025-02-13 00:58:50,207 - INFO - Loading table: Lab from extracted_data\C4MLab.csv
2025-02-13 00:58:50,207 - INFO - Reading extracted_data\C4MLab.csv in chunks of size=500000


  !! Error: '|' expected after '"'

Loading table: Lab


2025-02-13 00:58:56,847 - ERROR - Error loading Lab: '|' expected after '"'
2025-02-13 00:58:57,005 - INFO - Loading table: MedicalProcedure from extracted_data\C4MMedicalProcedure.csv
2025-02-13 00:58:57,005 - INFO - Reading extracted_data\C4MMedicalProcedure.csv in chunks of size=500000


  !! Error: '|' expected after '"'

Loading table: MedicalProcedure


2025-02-13 00:59:07,392 - INFO - Successfully loaded MedicalProcedure: 1203002 rows
2025-02-13 00:59:07,394 - INFO - Loading table: Medication from extracted_data\C4MMedication.csv
2025-02-13 00:59:07,394 - INFO - Reading extracted_data\C4MMedication.csv in chunks of size=500000


  -> Done: 1203002 rows, 10 cols in 10.39s

Loading table: Medication


2025-02-13 00:59:10,694 - ERROR - Error loading Medication: '|' expected after '"'
2025-02-13 00:59:10,823 - INFO - Loading table: Referral from extracted_data\C4MReferral.csv
2025-02-13 00:59:10,823 - INFO - Reading extracted_data\C4MReferral.csv in chunks of size=500000


  !! Error: '|' expected after '"'

Loading table: Referral


2025-02-13 00:59:20,731 - INFO - Successfully loaded Referral: 1141061 rows
2025-02-13 00:59:20,731 - INFO - Loading table: RiskFactor from extracted_data\C4MRiskFactor.csv
2025-02-13 00:59:20,735 - INFO - Reading extracted_data\C4MRiskFactor.csv in chunks of size=500000


  -> Done: 1141061 rows, 12 cols in 9.91s

Loading table: RiskFactor


2025-02-13 00:59:21,659 - ERROR - Error loading RiskFactor: '|' expected after '"'


  !! Error: '|' expected after '"'

Total load time: 140.73s

=== Loading Summary ===
                table      rows  columns  time_sec
0             Patient    352161        6      2.11
1  PatientDemographic    352220       15      2.94
2           Encounter  11577739       11     97.02
3       FamilyHistory    325202       20      3.41
4    MedicalProcedure   1203002       10     10.39
5            Referral   1141061       12      9.91


In [6]:
import cudf
print(cudf.__version__)


ModuleNotFoundError: No module named 'cudf'

In [4]:
# Create necessary directories
dirs_to_create = ['logs', 'prepared_data', 'figures', 'extracted_data']
for dir_name in dirs_to_create:
    Path(dir_name).mkdir(exist_ok=True)
    print(f"Confirmed directory exists: {dir_name}")


Confirmed directory exists: logs
Confirmed directory exists: prepared_data
Confirmed directory exists: figures
Confirmed directory exists: extracted_data


In [5]:
# Verify config file exists before loading
config_path = Path('config.yaml')
if not config_path.exists():
    raise FileNotFoundError("config.yaml not found in current directory")

# Load configuration with error handling
try:
    with open(config_path, 'r', encoding='utf-8') as file:
        config = yaml.safe_load(file)
    print("Successfully loaded configuration")
except Exception as e:
    raise Exception(f"Error loading config.yaml: {str(e)}")

# Verify data directory structure
data_dir = Path('extracted_data')
if not data_dir.exists():
    raise FileNotFoundError("extracted_data directory not found")

# List expected files based on config
expected_files = [f"{table_config['filename']}.txt" 
                 for table_config in config['tables'].values()]

# Check for missing files
missing_files = [file for file in expected_files 
                if not (data_dir / file).exists()]

if missing_files:
    print("\nWarning: The following files are missing:")
    for file in missing_files:
        print(f"- {file}")
else:
    print("\nAll expected data files are present")

Successfully loaded configuration

- C4MPatient.txt
- C4MPatientDemographic.txt
- C4MEncounter.txt
- C4MEncounterDiagnosis.txt
- C4MHealthCondition.txt
- C4MLab.txt
- C4MMedicalProcedure.txt
- C4MMedication.txt
- C4MRiskFactor.txt
- C4MReferral.txt
- C4MFamilyHistory.txt


In [6]:
# Initialize pipeline with error handling
try:
    pipeline = DataPreparationPipeline(
        config_path=str(config_path),
        data_dir=str(data_dir)
    )
    print("\nSuccessfully initialized data preparation pipeline")
except Exception as e:
    raise Exception(f"Error initializing pipeline: {str(e)}")

# Print initial configuration summary
print("\nConfiguration Summary:")
print("-" * 50)
print(f"Number of tables configured: {len(config['tables'])}")
print(f"Sampling size: {config['sampling']['n_patients']} patients")
print(f"Data paths:")
for key, path in config['data_paths'].items():
    print(f"- {key}: {path}")



Successfully initialized data preparation pipeline

Configuration Summary:
--------------------------------------------------
Number of tables configured: 11
Sampling size: 2000 patients
Data paths:
- raw_data: extracted_data/
- prepared_data: prepared_data/
- file_extension: .csv


In [7]:
# Function to check table availability
def check_table_exists(table_name: str) -> bool:
    """Check if a table file exists in the data directory."""
    filename = config['tables'][table_name]['filename']
    file_ext = config['data_paths']['file_extension']
    return (data_dir / f"{filename}{file_ext}").exists()

# Print table availability status
print("\nTable Availability Status:")
print("-" * 50)
for table_name in config['tables']:
    status = "Available" if check_table_exists(table_name) else "Missing"
    print(f"{table_name}: {status}")

print("\nInitialization complete - ready to proceed with data loading")


Table Availability Status:
--------------------------------------------------
Patient: Available
PatientDemographic: Available
Encounter: Available
EncounterDiagnosis: Available
HealthCondition: Available
Lab: Available
MedicalProcedure: Available
Medication: Available
RiskFactor: Available
Referral: Available
FamilyHistory: Available

Initialization complete - ready to proceed with data loading


## Data Load and Initial Stats

In [8]:
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
from data_preparation import DataPreparationPipeline
import yaml
from datetime import datetime, timedelta
import time
import warnings
warnings.filterwarnings('ignore')


# 1. Data Loading
print("Loading tables...")
tables = {}
loading_summary = []

for table_name in config['tables'].keys():
    try:
        print(f"\nLoading {table_name}...")
        start_time = datetime.now()
        
        # Load table
        tables[table_name] = pipeline.load_table(table_name)
        
        # Calculate loading statistics
        duration = (datetime.now() - start_time).total_seconds()
        memory_usage = tables[table_name].memory_usage(deep=True).sum() / 1024**2  # MB
        
        loading_summary.append({
            'Table': table_name,
            'Rows': len(tables[table_name]),
            'Columns': len(tables[table_name].columns),
            'Memory (MB)': f"{memory_usage:.2f}",
            'Load Time (s)': f"{duration:.2f}",
            'Status': 'Success'
        })
        
        print(f"Loaded {len(tables[table_name])} rows and {len(tables[table_name].columns)} columns")
        print(f"Memory usage: {memory_usage:.2f} MB")
        print(f"Loading time: {duration:.2f} seconds")
        
    except Exception as e:
        print(f"Error loading {table_name}: {str(e)}")
        loading_summary.append({
            'Table': table_name,
            'Rows': 0,
            'Columns': 0,
            'Memory (MB)': 0,
            'Load Time (s)': 0,
            'Status': f'Failed: {str(e)}'
        })

# Display loading summary
summary_df = pd.DataFrame(loading_summary)
print("\nLoading Summary:")
print("=" * 80)
print(summary_df.to_string(index=False))

# 2. Initial Data Quality Check
print("\nChecking data quality...")
quality_summary = []

for table_name, df in tables.items():
    # Calculate basic statistics
    missing_pct = df.isnull().mean().mean() * 100
    duplicate_rows = df.duplicated().sum()
    memory_usage = df.memory_usage(deep=True).sum() / 1024**2
    
    # Get date columns for this table
    date_cols = config['tables'][table_name].get('date_columns', [])
    date_range = {}
    
    # Check date ranges if date columns exist
    for col in date_cols:
        if col in df.columns:
            try:
                df[col] = pd.to_datetime(df[col])
                date_range[col] = {
                    'min': df[col].min(),
                    'max': df[col].max()
                }
            except Exception as e:
                date_range[col] = f"Error converting dates: {str(e)}"
    
    quality_summary.append({
        'Table': table_name,
        'Missing Data %': f"{missing_pct:.2f}%",
        'Duplicate Rows': duplicate_rows,
        'Memory (MB)': f"{memory_usage:.2f}",
        'Date Ranges': date_range
    })

print("\nQuality Summary:")
print("=" * 80)
for entry in quality_summary:
    print(f"\nTable: {entry['Table']}")
    print(f"Missing Data: {entry['Missing Data %']}")
    print(f"Duplicate Rows: {entry['Duplicate Rows']}")
    print(f"Memory Usage: {entry['Memory (MB']} MB")
    if entry['Date Ranges']:
        print("Date Ranges:")
        for col, range_info in entry['Date Ranges'].items():
            print(f"  {col}: {range_info}")

# 3. Sample size verification
print("\nVerifying patient sample size...")
if 'Patient' in tables:
    patient_count = len(tables['Patient'])
    print(f"Total patients in dataset: {patient_count}")
    
    if 'sampling' in config and 'n_patients' in config['sampling']:
        target_sample = config['sampling']['n_patients']
        print(f"Target sample size: {target_sample}")
        if patient_count > target_sample:
            print("Will proceed with sampling in next step")
        else:
            print("Warning: Current patient count is less than target sample size")

# 4. Save initial statistics
loading_stats = pd.DataFrame(loading_summary)
loading_stats.to_csv('prepared_data/loading_statistics.csv', index=False)

print("\nInitial data assessment complete. Ready to proceed with sampling and detailed analysis.")

2025-02-13 00:28:20,009 - INFO - Loading table: Patient from extracted_data\C4MPatient.csv


Loading tables...

Loading Patient...


2025-02-13 00:28:26,020 - ERROR - Error loading Patient: No columns to parse from file
2025-02-13 00:28:26,022 - INFO - Loading table: PatientDemographic from extracted_data\C4MPatientDemographic.csv


Error loading Patient: No columns to parse from file

Loading PatientDemographic...


2025-02-13 00:28:32,120 - ERROR - Error loading PatientDemographic: No columns to parse from file
2025-02-13 00:28:32,120 - INFO - Loading table: Encounter from extracted_data\C4MEncounter.csv


Error loading PatientDemographic: No columns to parse from file

Loading Encounter...


2025-02-13 00:31:42,663 - ERROR - Error loading Encounter: No columns to parse from file
2025-02-13 00:31:42,663 - INFO - Loading table: EncounterDiagnosis from extracted_data\C4MEncounterDiagnosis.csv


Error loading Encounter: No columns to parse from file

Loading EncounterDiagnosis...


KeyboardInterrupt: 