# 05: Data Cleaning & Exploratory Data Analysis
## Healthcare Resource Optimization Project

In [None]:
import sys
# Add the parent directory to the import search path; the `src.*` imports
# below presumably rely on this when the notebook runs from its own folder.
sys.path.append('..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from src.data_processing.cleaning import DataCleaner
from src.data_processing.data_validation import DataValidator
from src.analysis.eda_functions import EDAAnalyzer

# Apply seaborn's 'whitegrid' style to the figures drawn in this notebook.
sns.set_style('whitegrid')
# IPython magic: show matplotlib figures inline in the cell output.
%matplotlib inline

## 1. Load Raw Data

In [None]:
# Read the raw NHAMCS CSV into a DataFrame
raw_csv_path = '../data/raw/nhamcs/nhamcs_2021.csv'
nhamcs_df = pd.read_csv(raw_csv_path)

# Report the (rows, columns) shape, then let the notebook render the
# leading rows as the cell's output value
print(f"Dataset shape: {nhamcs_df.shape}")
print(f"\nFirst few rows:")
nhamcs_df.head()

## 2. Data Validation

In [None]:
# Build the validation report for the raw frame, naming VDATE as the only
# date column, and keep the returned report for later inspection
validator = DataValidator()
report = validator.generate_full_report(nhamcs_df, date_cols=['VDATE'])

# Print the report through the validator
validator.print_report()

## 3. Data Cleaning

In [None]:
# Run the NHAMCS cleaning routine on the raw frame
cleaner = DataCleaner()
nhamcs_clean = cleaner.clean_nhamcs_data(nhamcs_df)

print(f"\nCleaned data shape: {nhamcs_clean.shape}")

# Columns present after cleaning that the raw frame does not have
print(f"\nNew columns created:")
new_cols = set(nhamcs_clean.columns).difference(set(nhamcs_df.columns))
print(new_cols)

## 4. Exploratory Data Analysis

In [None]:
# Analyzer instance reused by the later EDA cells
eda = EDAAnalyzer()

# Compute summary statistics for the cleaned frame and preview the first rows
preview_rows = 10
summary = eda.get_summary_stats(nhamcs_clean)
print("Summary Statistics:")
summary.head(preview_rows)

In [None]:
# Distribution analysis: a 2x2 grid with one panel per derived column.
# A panel is left blank when its column is missing from nhamcs_clean.
from pathlib import Path

fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Age distribution
if 'age_group' in nhamcs_clean.columns:
    nhamcs_clean['age_group'].value_counts().plot(kind='bar', ax=axes[0, 0])
    axes[0, 0].set_title('ER Visits by Age Group')
    axes[0, 0].set_xlabel('Age Group')
    axes[0, 0].set_ylabel('Count')

# Time of day distribution
if 'time_of_day' in nhamcs_clean.columns:
    nhamcs_clean['time_of_day'].value_counts().plot(kind='bar', ax=axes[0, 1])
    axes[0, 1].set_title('ER Visits by Time of Day')
    axes[0, 1].set_xlabel('Time of Day')
    # Label the y-axis the same way as the age panel
    axes[0, 1].set_ylabel('Count')

# Day of week
if 'day_of_week' in nhamcs_clean.columns:
    day_names = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
    day_positions = range(len(day_names))
    # Reindex onto 0..6 so a weekday with no visits becomes a zero-height bar.
    # Without this, bar() raises a shape-mismatch ValueError whenever fewer
    # than seven distinct weekdays appear in the data.
    # NOTE(review): assumes day_of_week is coded 0=Mon ... 6=Sun, matching
    # day_names -- confirm against DataCleaner.
    day_counts = (
        nhamcs_clean['day_of_week']
        .value_counts()
        .reindex(day_positions, fill_value=0)
    )
    axes[1, 0].bar(day_positions, day_counts.values)
    axes[1, 0].set_xticks(day_positions)
    axes[1, 0].set_xticklabels(day_names)
    axes[1, 0].set_title('ER Visits by Day of Week')

# High acuity distribution
if 'high_acuity' in nhamcs_clean.columns:
    nhamcs_clean['high_acuity'].value_counts().plot(
        kind='pie', ax=axes[1, 1], autopct='%1.1f%%'
    )
    axes[1, 1].set_title('High vs Low Acuity Visits')

plt.tight_layout()

# savefig does not create missing folders, so make sure the target exists
figure_path = '../visualizations/eda/distributions.png'
Path(figure_path).parent.mkdir(parents=True, exist_ok=True)
plt.savefig(figure_path, dpi=300)
plt.show()

## 5. Correlation Analysis

In [None]:
# Ask the analyzer for correlations in the cleaned frame, using a 0.3 threshold
strong_corr = eda.find_correlations(nhamcs_clean, threshold=0.3)

# Handle the empty result first; otherwise print the returned table
if strong_corr.empty:
    print("No strong correlations found above threshold")
else:
    print("Strong Correlations Found:")
    print(strong_corr)

## 6. Save Cleaned Data

In [None]:
# Save cleaned dataset as CSV without the index column
from pathlib import Path

cleaned_csv_path = '../data/processed/cleaned_nhamcs.csv'
# to_csv does not create missing folders, so make sure the target exists
Path(cleaned_csv_path).parent.mkdir(parents=True, exist_ok=True)
nhamcs_clean.to_csv(cleaned_csv_path, index=False)
# The check mark is written as an escape so it survives re-encoding of this
# file; the previous literal had been garbled into mojibake.
print("\u2713 Cleaned data saved to data/processed/cleaned_nhamcs.csv")