# Cleaning Filevine Contact Information

In [1]:
import logging
import numpy as np
import os
import pandas as pd
from typing import Any, Callable, Dict, List

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

In [2]:
def clean_phone_number(phone_number: str) -> str:
    '''Normalize a phone number to ``(XXX) XXX-XXXX`` when it has exactly 10 digits.

    Spaces, dashes, and parentheses are stripped first. A value that does not
    reduce to exactly ten digits is returned in its stripped form, and a
    missing value (``None``/``NaN``) is returned unchanged.

    Spreadsheets often store phone numbers as numbers, so non-string input is
    converted to text instead of raising ``AttributeError``; a whole-number
    float such as ``4109284192.0`` is treated as ``4109284192``.
    '''
    if pd.isna(phone_number):
        return phone_number

    # Drop the trailing ".0" a numeric cell would otherwise carry into the text
    if isinstance(phone_number, float) and phone_number.is_integer():
        phone_number = int(phone_number)

    # One pass that deletes every separator character
    stripped = str(phone_number).translate(str.maketrans('', '', '- ()'))

    if len(stripped) == 10 and stripped.isdigit():
        return f"({stripped[:3]}) {stripped[3:6]}-{stripped[6:]}"
    return stripped
    

def clean_address(address: str) -> str:
    '''Tidy an address by collapsing whitespace and removing empty comma groups.

    Every run of whitespace becomes a single space and leading/trailing
    whitespace is dropped. Empty address parts written as ``", ,"`` are then
    removed repeatedly until none remain, so ``"a, , , b"`` becomes ``"a, b"``.
    A missing value (``None``/``NaN``) is returned unchanged; any other
    non-string value is converted to text first.
    '''
    if pd.isna(address):
        return address

    # split()/join collapses runs of any length, unlike a single replace('  ', ' ')
    cleaned = ' '.join(str(address).split())

    # One replace pass can leave a new ", ," behind (e.g. "a, , , b"), so repeat
    while ', ,' in cleaned:
        cleaned = cleaned.replace(', ,', ',')
    return cleaned


def clean_geocode(value) -> float:
    '''Parse a single coordinate value into a float, or NaN when unusable.

    The value is converted to text, a doubled minus sign ("--") is reduced to
    one, and the result is parsed as a float. Missing values, unparseable
    text, and numbers outside [-180, 180] all yield NaN.

    Note that the same [-180, 180] bound is applied to every value; the
    tighter [-90, 90] latitude range is not enforced here.
    '''
    if pd.isna(value):
        return np.nan

    text = str(value).replace('--', '-')

    try:
        coordinate = float(text)
    except (ValueError, TypeError):
        return np.nan

    if not (-180 <= coordinate <= 180):
        return np.nan
    return coordinate


def validate_data_quality(df: pd.DataFrame, data_type: str) -> Dict[str, Any]:
    """Compute simple data-quality counts for a referral dataframe and log them.

    Args:
        df: Dataframe to inspect. The columns 'Full Name', 'Work Address',
            'Work Phone', 'Latitude' and 'Longitude' are used when present.
        data_type: Label placed at the start of the log message.

    Returns:
        A dict of plain ``int`` counts with the keys 'total_records',
        'missing_names', 'missing_addresses', 'missing_phones' and
        'invalid_geocodes'. A missing-value count is 0 when its column is
        absent. 'invalid_geocodes' is only computed when both coordinate
        columns exist; it counts rows where either coordinate is present but
        is non-numeric or outside its range (latitude ±90, longitude ±180).
    """
    def _missing_count(column: str) -> int:
        # Absent columns report 0 rather than raising
        if column not in df.columns:
            return 0
        return int(df[column].isna().sum())

    def _invalid_mask(column: str, limit: float) -> pd.Series:
        # Coerce first so text values cannot raise TypeError in the comparison;
        # a present value that fails to parse is itself an invalid geocode.
        raw = df[column]
        numeric = pd.to_numeric(raw, errors='coerce')
        return raw.notna() & (numeric.isna() | (numeric.abs() > limit))

    quality_metrics = {
        'total_records': len(df),
        'missing_names': _missing_count('Full Name'),
        'missing_addresses': _missing_count('Work Address'),
        'missing_phones': _missing_count('Work Phone'),
        'invalid_geocodes': 0,
    }

    # Check geocode validity
    if 'Latitude' in df.columns and 'Longitude' in df.columns:
        invalid = _invalid_mask('Latitude', 90) | _invalid_mask('Longitude', 180)
        # int() keeps the logged dict free of NumPy scalar reprs
        quality_metrics['invalid_geocodes'] = int(invalid.sum())

    # Log quality metrics
    logging.info(f"{data_type} - Quality Metrics: {quality_metrics}")

    return quality_metrics


def clean_referral_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of a standardized referral dataframe.

    The input is not modified. Each step runs only when its column exists:

    * 'Work Phone' is passed through ``clean_phone_number``.
    * 'Work Address' is passed through ``clean_address``.
    * 'Latitude' and 'Longitude' are passed through ``clean_geocode``;
      latitudes outside [-90, 90] are additionally replaced with NaN.

    When both 'Date of Intake' and 'Full Name' exist, rows are sorted by those
    two columns (ascending) and 'Date of Intake' becomes the index.
    """
    df = df.copy()

    # Clean columns if they exist
    if 'Work Phone' in df.columns:
        df['Work Phone'] = df['Work Phone'].apply(clean_phone_number)
    if 'Work Address' in df.columns:
        df['Work Address'] = df['Work Address'].apply(clean_address)
    if 'Latitude' in df.columns:
        latitude = df['Latitude'].apply(clean_geocode)
        # clean_geocode only rejects values beyond ±180, so a latitude such as
        # 120 would survive; blank it the same way out-of-range longitudes are.
        df['Latitude'] = latitude.where(latitude.between(-90, 90))
    if 'Longitude' in df.columns:
        df['Longitude'] = df['Longitude'].apply(clean_geocode)

    # Sort and set index
    if 'Date of Intake' in df.columns and 'Full Name' in df.columns:
        df = df.sort_values(by=['Date of Intake', 'Full Name'], ascending=True)
        df = df.set_index('Date of Intake')

    return df


def process_referral_data(df: pd.DataFrame, column_mapping: Dict[str, str], 
                         filter_conditions: List[Callable] = None) -> pd.DataFrame:
    """Select, rename, filter, and clean the columns for one referral type.

    Args:
        df: Source dataframe; it must contain every key of ``column_mapping``.
        column_mapping: Maps source column names to standardized names.
        filter_conditions: Optional callables, each taking the renamed
            dataframe and returning a row mask. They are applied in order,
            and the index is reset after each one.

    Returns:
        The result of ``clean_referral_dataframe`` on the filtered rows.
    """
    source_columns = list(column_mapping)
    selected = df[source_columns].copy().rename(columns=column_mapping)

    # Each filter sees the rows that survived the previous one
    for keep_rows in filter_conditions or []:
        selected = selected[keep_rows(selected)].reset_index(drop=True)

    return clean_referral_dataframe(selected)

def process_all_referrals(df_all: pd.DataFrame, configs: Dict) -> Dict[str, pd.DataFrame]:
    """Run ``process_referral_data`` for every entry in ``configs``.

    Each config supplies its column mapping under 'columns' and, optionally,
    its row filters under 'filters'. Every processed dataframe is also passed
    to ``validate_data_quality`` so its metrics are logged.

    Returns:
        A dict mapping each referral type to its processed dataframe.
    """
    results = {}
    for referral_type, config in configs.items():
        logging.info(f"Processing {referral_type} referrals...")
        cleaned = process_referral_data(df_all, config['columns'], config.get('filters'))
        results[referral_type] = cleaned
        # Log quality metrics for this referral type
        validate_data_quality(cleaned, referral_type)
    return results

# Add column validation and schema validation functions
def validate_column_existence(df: pd.DataFrame, config_name: str, required_columns: list) -> bool:
    """Check that ``df`` contains every column in ``required_columns``.

    Logs an error listing the absent columns and returns False when any are
    missing; otherwise logs a confirmation and returns True.
    """
    available = df.columns
    missing_cols = [name for name in required_columns if name not in available]
    if not missing_cols:
        logging.info(f"✅ All required columns found for {config_name}")
        return True
    logging.error(f"❌ Missing columns for {config_name}: {missing_cols}")
    return False


def validate_output_schema(df: pd.DataFrame, data_type: str) -> bool:
    """Check that ``df`` carries the base provider columns.

    Returns False (after logging an error) when any of 'Full Name',
    'Work Phone', 'Work Address', 'Latitude' or 'Longitude' is missing.
    Otherwise a warning is logged for each coordinate column that holds
    non-null values which cannot be parsed as numbers, a success message is
    logged, and True is returned; the warnings do not change the result.
    """
    required = ('Full Name', 'Work Phone', 'Work Address', 'Latitude', 'Longitude')
    missing_cols = [name for name in required if name not in df.columns]
    if missing_cols:
        logging.error(f"❌ Missing expected columns in {data_type}: {missing_cols}")
        return False

    # Both coordinate columns are known to exist at this point
    for column, label in (('Latitude', 'latitude'), ('Longitude', 'longitude')):
        values = df[column]
        unparseable = values.notna() & pd.to_numeric(values, errors='coerce').isna()
        if unparseable.any():
            logging.warning(f"⚠️ Non-numeric {label} values found in {data_type}: {unparseable.sum()}")

    logging.info(f"✅ Output schema validation passed for {data_type}")
    return True

In [3]:
# Per-referral-type configuration: which source columns to take, what to
# rename them to, and which row filters to apply after renaming.
def _doctor_office_filters():
    """Return fresh row filters keeping doctor's-office referrals with a name and address."""
    return [
        lambda df: df['Referral Source'] == "Referral - Doctor's Office",
        lambda df: df['Full Name'].notna(),
        lambda df: df['Work Address'].notna(),
    ]


_PRIMARY_INBOUND_COLUMNS = {
    'Project ID': 'Project ID',
    'Date of Intake': 'Date of Intake',
    'Referral Source': 'Referral Source',
    'Referred From Full Name': 'Full Name',
    "Referred From's Work Phone": 'Work Phone',
    "Referred From's Work Address": 'Work Address',
    "Referred From's Details: Latitude": 'Latitude',
    "Referred From's Details: Longitude": 'Longitude',
}

_SECONDARY_INBOUND_COLUMNS = {
    'Project ID': 'Project ID',
    'Date of Intake': 'Date of Intake',
    'Secondary Referral Source': 'Referral Source',
    'Secondary Referred From Full Name': 'Full Name',
    "Secondary Referred From's Work Phone": 'Work Phone',
    "Secondary Referred From's Work Address": 'Work Address',
    "Secondary Referred From's Details: Latitude": 'Latitude',
    "Secondary Referred From's Details: Longitude": 'Longitude',
}

# Outbound rows have no referral-source column in the export
_OUTBOUND_COLUMNS = {
    'Project ID': 'Project ID',
    'Date of Intake': 'Date of Intake',
    'Dr/Facility Referred To Full Name': 'Full Name',
    "Dr/Facility Referred To's Work Phone": 'Work Phone',
    "Dr/Facility Referred To's Work Address": 'Work Address',
    "Dr/Facility Referred To's Details: Latitude": 'Latitude',
    "Dr/Facility Referred To's Details: Longitude": 'Longitude',
}

REFERRAL_CONFIGS = {
    'primary_inbound': {
        'columns': _PRIMARY_INBOUND_COLUMNS,
        'filters': _doctor_office_filters(),
    },
    'secondary_inbound': {
        'columns': _SECONDARY_INBOUND_COLUMNS,
        'filters': _doctor_office_filters(),
    },
    'outbound': {
        'columns': _OUTBOUND_COLUMNS,
        'filters': [lambda df: df['Full Name'].notna()],
    },
}

## Load Full Dataset

***Reminder:*** `.gitignore` is set to ignore the source data folder to protect data privacy!

In [4]:
# Read the raw contact export into a DataFrame (path is relative to the working directory)
df_all = pd.read_excel('../data/raw/Referrals_App_Full_Contacts.xlsx')
# Bare expression: display the loaded frame as the cell output
df_all

Unnamed: 0,Project ID,Create Date,Date of Intake,Referral Source,Referred From Full Name,Referred From's Work Phone,Referred From's Work Address,Referred From's Details: Latitude,Referred From's Details: Longitude,Secondary Referral Source,Secondary Referred From Full Name,Secondary Referred From's Work Phone,Secondary Referred From's Work Address,Secondary Referred From's Details: Latitude,Secondary Referred From's Details: Longitude,Dr/Facility Referred To Full Name,Dr/Facility Referred To's Work Phone,Dr/Facility Referred To's Work Address,Dr/Facility Referred To's Details: Latitude,Dr/Facility Referred To's Details: Longitude
0,991278220,2005-01-29 12:53:07,,Other,,,,,,,,,,,,,,,,
1,991278219,2005-01-29 12:53:07,,Referral - Attorney,,,,,,,,,,,,,,,,
2,991278217,2005-01-29 12:53:07,,Referral - Attorney,,,,,,,,,,,,,,,,
3,991278216,2005-01-29 12:53:07,,Other,,,,,,,,,,,,,,,,
4,991278210,2005-01-29 12:53:07,,Other,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11594,992326821,2025-09-11 11:07:22,45911.0,Google Business Profile,,,,,,,,,,,,,,,,
11595,992326822,2025-09-11 11:08:18,45910.0,Referral - Client,Erica P. Thompson,,,,,,,,,,,,,,,
11596,992327067,2025-09-11 14:49:02,45911.0,Google Organic Search,,,,,,,,,,,,,,,,
11597,992327094,2025-09-11 15:21:48,45909.0,Other,,,,,,,,,,,,Effective Integrative Healthcare - Millersvill...,410-928-4192,"683 Old Mill Rd, , Millersville, MD 21108",39.11715,-76.631614


In [5]:
# Convert Excel date serial numbers to pandas datetimes.
# The values are read as day counts from 1899-12-30: using that origin (rather
# than 1900-01-01) absorbs Excel's fictitious 1900-02-29 so modern dates line up.
# Missing values come out as NaT.
df_all['Date of Intake'] = pd.to_datetime(df_all['Date of Intake'], unit='D', origin='1899-12-30')
# Bare expression: display the converted column as the cell output
df_all['Date of Intake']

0              NaT
1              NaT
2              NaT
3              NaT
4              NaT
           ...    
11594   2025-09-11
11595   2025-09-10
11596   2025-09-11
11597   2025-09-09
11598   2025-09-12
Name: Date of Intake, Length: 11599, dtype: datetime64[ns]

In [6]:
# Keep both date columns as datetime so 'Create Date' can later fill gaps in
# 'Date of Intake'.
# unit='D' is only meaningful for numeric input, and with the default epoch it
# would place Excel serial numbers in the wrong century, so the conversion is
# chosen from the column's actual dtype.
create_date = df_all['Create Date']
if pd.api.types.is_numeric_dtype(create_date):
    # Excel serial day numbers: use the same origin as 'Date of Intake'
    df_all['Create Date'] = pd.to_datetime(create_date, unit='D', origin='1899-12-30')
else:
    # Already datetimes (or date strings): a plain conversion is enough
    df_all['Create Date'] = pd.to_datetime(create_date)

In [7]:
# Where the intake date is missing, fall back to the value in 'Create Date'
df_all['Date of Intake'] = df_all['Date of Intake'].fillna(df_all['Create Date'])
# Bare expression: display the filled column as the cell output
df_all['Date of Intake']

0       2005-01-29 12:53:07
1       2005-01-29 12:53:07
2       2005-01-29 12:53:07
3       2005-01-29 12:53:07
4       2005-01-29 12:53:07
                ...        
11594   2025-09-11 00:00:00
11595   2025-09-10 00:00:00
11596   2025-09-11 00:00:00
11597   2025-09-09 00:00:00
11598   2025-09-12 00:00:00
Name: Date of Intake, Length: 11599, dtype: datetime64[ns]

In [8]:
# Count the missing values in each column
df_all.isna().sum()

Project ID                                          0
Create Date                                         0
Date of Intake                                      0
Referral Source                                   187
Referred From Full Name                         11045
Referred From's Work Phone                      11336
Referred From's Work Address                    11460
Referred From's Details: Latitude               11511
Referred From's Details: Longitude              11511
Secondary Referral Source                        9859
Secondary Referred From Full Name               11546
Secondary Referred From's Work Phone            11576
Secondary Referred From's Work Address          11582
Secondary Referred From's Details: Latitude     11586
Secondary Referred From's Details: Longitude    11586
Dr/Facility Referred To Full Name               11220
Dr/Facility Referred To's Work Phone            11224
Dr/Facility Referred To's Work Address          11237
Dr/Facility Referred To's De

In [9]:
# Confirm that every source column named in the configs exists before processing
print("🔍 VALIDATING DATA STRUCTURE:")
print("="*50)
print(f"📊 Dataset shape: {df_all.shape}")
print(f"📋 Available columns: {len(df_all.columns)}")

for config_name, config in REFERRAL_CONFIGS.items():
    required_cols = list(config['columns'])
    is_valid = validate_column_existence(df_all, config_name, required_cols)
    if is_valid:
        print(f"✅ {config_name} configuration validated")
    else:
        print(f"❌ Validation failed for {config_name}")

2025-09-17 04:26:18,610 - INFO - ✅ All required columns found for primary_inbound
2025-09-17 04:26:18,610 - INFO - ✅ All required columns found for secondary_inbound
2025-09-17 04:26:18,610 - INFO - ✅ All required columns found for outbound


🔍 VALIDATING DATA STRUCTURE:
📊 Dataset shape: (11599, 20)
📋 Available columns: 20
✅ primary_inbound configuration validated
✅ secondary_inbound configuration validated
✅ outbound configuration validated


## Streamlined Referral Processing

This section uses the new configuration-driven approach to process all referral types efficiently.

### Process All Referral Types with Configuration

In [10]:
# Run the configuration-driven pipeline once for every referral type
referral_results = process_all_referrals(df_all, REFERRAL_CONFIGS)

# Pull the three result frames out into their own names
df_primary_inbound, df_secondary_inbound, df_outbound = (
    referral_results[key]
    for key in ('primary_inbound', 'secondary_inbound', 'outbound')
)

print("Processing complete! Individual datasets created:")
for label, frame in (
    ("Primary inbound referrals", df_primary_inbound),
    ("Secondary inbound referrals", df_secondary_inbound),
    ("Outbound referrals", df_outbound),
):
    print(f"{label}: {len(frame)} records")

2025-09-17 04:26:18,620 - INFO - Processing primary_inbound referrals...
2025-09-17 04:26:18,629 - INFO - primary_inbound - Quality Metrics: {'total_records': 79, 'missing_names': 0, 'missing_addresses': 0, 'missing_phones': 0, 'invalid_geocodes': 0}
2025-09-17 04:26:18,632 - INFO - Processing secondary_inbound referrals...
2025-09-17 04:26:18,637 - INFO - secondary_inbound - Quality Metrics: {'total_records': 1, 'missing_names': 0, 'missing_addresses': 0, 'missing_phones': 0, 'invalid_geocodes': 0}
2025-09-17 04:26:18,637 - INFO - Processing outbound referrals...
2025-09-17 04:26:18,645 - INFO - outbound - Quality Metrics: {'total_records': 379, 'missing_names': 0, 'missing_addresses': 17, 'missing_phones': 4, 'invalid_geocodes': 0}


Processing complete! Individual datasets created:
Primary inbound referrals: 79 records
Secondary inbound referrals: 1 records
Outbound referrals: 379 records


### Create Combined Dataset with Referral Type Indicator

In [11]:
# Stack primary and secondary inbound referrals, keeping the date index
inbound_parts = [df_primary_inbound, df_secondary_inbound]
df_inbound_combined = pd.concat(inbound_parts, ignore_index=False)
df_inbound_combined['referral_type'] = 'inbound'

# Tag outbound rows, then give them a referral source since they have none
df_outbound['referral_type'] = 'outbound'
df_outbound['Referral Source'] = 'Outbound Referral'

# Single dataset holding every referral
all_parts = [df_inbound_combined, df_outbound]
df_all_referrals = pd.concat(all_parts, ignore_index=False)

# Log quality metrics for the combined dataset
validate_data_quality(df_all_referrals, "Combined All Referrals")

type_counts = df_all_referrals['referral_type'].value_counts()
print(f"Combined dataset created with {len(df_all_referrals)} total records")
print(f"Inbound referrals: {type_counts.get('inbound', 0)}")
print(f"Outbound referrals: {type_counts.get('outbound', 0)}")

df_all_referrals.head()

2025-09-17 04:26:18,664 - INFO - Combined All Referrals - Quality Metrics: {'total_records': 459, 'missing_names': 0, 'missing_addresses': 17, 'missing_phones': 4, 'invalid_geocodes': 0}


Combined dataset created with 459 total records
Inbound referrals: 80
Outbound referrals: 379


Unnamed: 0_level_0,Project ID,Referral Source,Full Name,Work Phone,Work Address,Latitude,Longitude,referral_type
Date of Intake,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2022-10-03 07:32:39,991276984,Referral - Doctor's Office,Bezak Chiropractic And Rehabilitation,(301) 220-0496,"7500 Hanover Parkway, Suite 102, Greenbelt, MD...",38.992689,-76.875632,inbound
2022-10-04 00:00:00,991275617,Referral - Doctor's Office,Bezak Chiropractic And Rehabilitation,(301) 220-0496,"7500 Hanover Parkway, Suite 102, Greenbelt, MD...",38.992689,-76.875632,inbound
2022-10-06 11:39:51,991281240,Referral - Doctor's Office,Bezak Chiropractic And Rehabilitation,(301) 220-0496,"7500 Hanover Parkway, Suite 102, Greenbelt, MD...",38.992689,-76.875632,inbound
2022-10-07 06:37:11,991272012,Referral - Doctor's Office,Bezak Chiropractic And Rehabilitation,(301) 220-0496,"7500 Hanover Parkway, Suite 102, Greenbelt, MD...",38.992689,-76.875632,inbound
2022-10-07 06:37:43,991275076,Referral - Doctor's Office,Bezak Chiropractic And Rehabilitation,(301) 220-0496,"7500 Hanover Parkway, Suite 102, Greenbelt, MD...",38.992689,-76.875632,inbound


### Save Processed Data

Save both individual datasets (for backward compatibility) and the combined dataset.

In [12]:
# Make sure the output directory exists
processed_dir = '../data/processed'
os.makedirs(processed_dir, exist_ok=True)

# (frame, file name, label used when validating)
# The two individual files are kept for backward compatibility; the combined
# file is the one recommended for future use.
outputs = (
    (df_inbound_combined, 'cleaned_inbound_referrals.parquet', "Inbound Referrals"),
    (df_outbound, 'cleaned_outbound_referrals.parquet', "Outbound Referrals"),
    (df_all_referrals, 'cleaned_all_referrals.parquet', "Combined Referrals"),
)

# Write every dataset with snappy compression, keeping the date index
for frame, filename, _ in outputs:
    frame.to_parquet(f'{processed_dir}/{filename}', compression='snappy', index=True)

# Validate output schemas
for frame, _, label in outputs:
    validate_output_schema(frame, label)

print("✅ All datasets saved successfully!")
print("📁 Files created:")
print("  - cleaned_inbound_referrals.parquet (legacy compatibility)")
print("  - cleaned_outbound_referrals.parquet (legacy compatibility)")
print("  - cleaned_all_referrals.parquet (recommended for new analysis)")

# Display summary statistics
unique_providers = df_all_referrals['Full Name'].nunique()
first_day = df_all_referrals.index.min().strftime('%Y-%m-%d')
last_day = df_all_referrals.index.max().strftime('%Y-%m-%d')
contact_columns = ['Full Name', 'Work Address', 'Work Phone']
complete_contact = df_all_referrals[contact_columns].notna().all(axis=1).sum()
with_geocoding = df_all_referrals[['Latitude', 'Longitude']].notna().all(axis=1).sum()

print("\n📊 Summary Statistics:")
print(f"Total unique providers in combined dataset: {unique_providers}")
print(f"Date range: {first_day} to {last_day}")
print(f"Providers with complete contact info: {complete_contact}")
print(f"Providers with geocoding: {with_geocoding}")

2025-09-17 04:26:18,726 - INFO - ✅ Output schema validation passed for Inbound Referrals
2025-09-17 04:26:18,729 - INFO - ✅ Output schema validation passed for Outbound Referrals
2025-09-17 04:26:18,730 - INFO - ✅ Output schema validation passed for Combined Referrals


✅ All datasets saved successfully!
📁 Files created:
  - cleaned_inbound_referrals.parquet (legacy compatibility)
  - cleaned_outbound_referrals.parquet (legacy compatibility)
  - cleaned_all_referrals.parquet (recommended for new analysis)

📊 Summary Statistics:
Total unique providers in combined dataset: 71
Date range: 2022-10-03 to 2025-09-10
Providers with complete contact info: 440
Providers with geocoding: 459


In [13]:
# Integration test: reload the saved parquet files the way a consumer would
print("🧪 INTEGRATION TEST:")
print("="*50)

try:
    # Read the three files back with pandas
    processed_dir = '../data/processed'
    test_inbound = pd.read_parquet(f'{processed_dir}/cleaned_inbound_referrals.parquet')
    test_outbound = pd.read_parquet(f'{processed_dir}/cleaned_outbound_referrals.parquet')
    test_combined = pd.read_parquet(f'{processed_dir}/cleaned_all_referrals.parquet')

    print("✅ Successfully loaded cleaned datasets:")
    for label, frame in (
        ("Inbound", test_inbound),
        ("Outbound", test_outbound),
        ("Combined", test_combined),
    ):
        print(f"   📊 {label}: {len(frame)} records, columns: {list(frame.columns)}")

    # Count rows lacking coordinates or essential contact fields
    print("\n🔍 Data Quality Check:")
    coordinate_gaps = test_combined[['Latitude', 'Longitude']].isna().any(axis=1)
    missing_coords = coordinate_gaps.sum()
    print(f"   Records missing coordinates: {missing_coords}")

    contact_gaps = test_combined[['Full Name', 'Work Address']].isna().any(axis=1)
    missing_contact = contact_gaps.sum()
    print(f"   Records missing essential contact info: {missing_contact}")

    # Report the index type and coordinate dtypes after the round trip
    print("\n📋 Data Types Check:")
    print(f"   Date index: {type(test_combined.index)}")
    for column in ('Latitude', 'Longitude'):
        print(f"   {column} dtype: {test_combined[column].dtype}")

    print("✅ Integration test PASSED - Data is ready for main application!")

except Exception as e:
    print(f"❌ Integration test FAILED: {e}")
    print("Please check the data processing pipeline for issues.")

🧪 INTEGRATION TEST:
✅ Successfully loaded cleaned datasets:
   📊 Inbound: 80 records, columns: ['Project ID', 'Referral Source', 'Full Name', 'Work Phone', 'Work Address', 'Latitude', 'Longitude', 'referral_type']
   📊 Outbound: 379 records, columns: ['Project ID', 'Full Name', 'Work Phone', 'Work Address', 'Latitude', 'Longitude', 'referral_type', 'Referral Source']
   📊 Combined: 459 records, columns: ['Project ID', 'Referral Source', 'Full Name', 'Work Phone', 'Work Address', 'Latitude', 'Longitude', 'referral_type']

🔍 Data Quality Check:
   Records missing coordinates: 0
   Records missing essential contact info: 17

📋 Data Types Check:
   Date index: <class 'pandas.core.indexes.datetimes.DatetimeIndex'>
   Latitude dtype: float64
   Longitude dtype: float64
✅ Integration test PASSED - Data is ready for main application!


In [14]:
# Compatibility check: load the saved files through the main application's
# own ingestion code rather than through pandas directly.
print("🔧 MAIN APPLICATION COMPATIBILITY TEST:")
print("="*60)

try:
    # Make the parent directory importable so the `src` package can be found.
    # This must happen before the project import below.
    import sys
    sys.path.append('..')

    from src.data.ingestion import DataIngestionManager, DataSource

    # Build the main app's data manager, pointed at the notebook's data folder
    data_manager = DataIngestionManager(data_dir="../data")

    # Load inbound referrals through the data manager
    inbound_data = data_manager.load_data(DataSource.INBOUND_REFERRALS, show_status=False)
    print(f"✅ Main app successfully loaded inbound data: {len(inbound_data)} records")

    # Load outbound referrals through the data manager
    outbound_data = data_manager.load_data(DataSource.OUTBOUND_REFERRALS, show_status=False)
    print(f"✅ Main app successfully loaded outbound data: {len(outbound_data)} records")

    # Load provider data (which uses outbound data)
    provider_data = data_manager.load_data(DataSource.PROVIDER_DATA, show_status=False)
    print(f"✅ Main app successfully loaded provider data: {len(provider_data)} records")

    # Show the column names the main app ends up with
    print(f"\n📋 Column compatibility check:")
    print(f"   Inbound columns: {list(inbound_data.columns)}")
    print(f"   Outbound columns: {list(outbound_data.columns)}")

    print(f"\n🎉 MAIN APPLICATION COMPATIBILITY TEST PASSED!")
    print(f"   The cleaned data is fully compatible with the main application.")

except ImportError as e:
    # The project package is not importable from here; treated as non-fatal
    print(f"⚠️ Could not import main application modules: {e}")
    print("This is normal if running the notebook independently.")

except Exception as e:
    # NOTE(review): this branch catches every non-ImportError failure, including
    # errors raised while the ingestion module itself is being imported, so the
    # "data format and file paths" hint below may not describe the real cause.
    print(f"❌ Compatibility test failed: {e}")
    print("Please check the data format and file paths.")

🔧 MAIN APPLICATION COMPATIBILITY TEST:
❌ Compatibility test failed: module 'streamlit' has no attribute 'cache_data'
Please check the data format and file paths.


## Workflow Improvements Summary

The new streamlined workflow provides several key improvements:

### ✅ **Efficiency Gains**
- **Reduced Code Duplication**: Single processing function handles all referral types
- **Configuration-Driven**: Easy to add new referral types or modify existing ones
- **Automated Validation**: Built-in data quality checks with logging

### ✅ **Better Data Structure**
- **Combined Dataset**: Single source of truth with referral type indicators
- **Consistent Schema**: Unified column names across all referral types
- **Backward Compatibility**: Still generates separate files for legacy systems

### ✅ **Enhanced Maintainability**
- **Modular Functions**: Each function has a single responsibility
- **Error Handling**: Better validation and logging throughout the process
- **Scalability**: Easy to extend for future requirements

In [15]:
# Summarize what the combined dataset makes easy to compute
import os

print("🎯 COMBINED DATASET BENEFITS:")
print("="*50)

# Number of records per referral type
referral_counts = df_all_referrals['referral_type'].value_counts()
print("📊 Referral Type Distribution:")
for ref_type, count in referral_counts.items():
    print(f"   {ref_type.title()}: {count} records")

# Share of records carrying both coordinates
print("\n📍 Geographic Coverage:")
geocoded = df_all_referrals[['Latitude', 'Longitude']].notna().all(axis=1)
print(f"   Providers with geocoding: {geocoded.sum()} / {len(df_all_referrals)} ({geocoded.mean():.1%})")

# Names that occur on both the inbound and the outbound side
print("\n🔄 Cross-Reference Analysis:")
is_inbound = df_all_referrals['referral_type'] == 'inbound'
is_outbound = df_all_referrals['referral_type'] == 'outbound'
inbound_providers = set(df_all_referrals.loc[is_inbound, 'Full Name'])
outbound_providers = set(df_all_referrals.loc[is_outbound, 'Full Name'])
bidirectional = inbound_providers & outbound_providers
print(f"   Providers in both inbound and outbound: {len(bidirectional)}")
if bidirectional:
    print(f"   Examples: {list(bidirectional)[:3]}")

# On-disk size of the two separate files versus the combined file, in KB
print("\n💾 File Size Comparison:")
processed_dir = '../data/processed'
inbound_size = os.path.getsize(f'{processed_dir}/cleaned_inbound_referrals.parquet') / 1024
outbound_size = os.path.getsize(f'{processed_dir}/cleaned_outbound_referrals.parquet') / 1024
combined_size = os.path.getsize(f'{processed_dir}/cleaned_all_referrals.parquet') / 1024
total_separate = inbound_size + outbound_size
reduction_pct = (total_separate - combined_size) / total_separate * 100

print(f"   Separate files: {total_separate:.1f} KB")
print(f"   Combined file: {combined_size:.1f} KB")
print(f"   Space efficiency: {reduction_pct:.1f}% reduction")

🎯 COMBINED DATASET BENEFITS:
📊 Referral Type Distribution:
   Outbound: 379 records
   Inbound: 80 records

📍 Geographic Coverage:
   Providers with geocoding: 459 / 459 (100.0%)

🔄 Cross-Reference Analysis:
   Providers in both inbound and outbound: 15
   Examples: ['Absolute Chiropractic Care - Oxon Hill, MD', 'Dunkirk Chiropractic & Wellness Center - Dunkirk', 'Pain and Rehab Center of Maryland - Camp Springs, MD']

💾 File Size Comparison:
   Separate files: 26.5 KB
   Combined file: 18.8 KB
   Space efficiency: 29.3% reduction


## Recommendations for Future Use

### 🎯 **Use the Combined Dataset** (`cleaned_all_referrals.parquet`)
- **Single source of truth** for all provider data
- **Unified schema** makes analysis simpler
- **Referral type filtering** with `df[df['referral_type'] == 'inbound']`
- **Cross-referencing** between inbound and outbound providers
- **Better performance** for provider recommendation algorithms

### 🔧 **Easy Configuration Updates**
To add new referral types or modify existing ones, simply update the `REFERRAL_CONFIGS` dictionary:

```python
REFERRAL_CONFIGS['new_referral_type'] = {
    'columns': {...},  # Column mapping
    'filters': [...]   # Filter functions
}
```

### 📈 **Next Steps for Provider Recommendation**
1. **Geocoding enhancement**: Fill missing coordinates for providers without addresses
2. **Deduplication**: Merge duplicate providers across referral types  
3. **Distance calculations**: Use the unified dataset for proximity-based recommendations
4. **Provider scoring**: Combine referral frequency with distance for recommendations