In [4]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
import plotly.express as px
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Database connection.
# The URI may be overridden through the HEALTHCARE_COSTS_DATABASE_URI
# environment variable so the notebook is not tied to one machine's account;
# when the variable is unset, the original local PostgreSQL URI is used.
import os

DATABASE_URI = os.environ.get(
    'HEALTHCARE_COSTS_DATABASE_URI',
    'postgresql://woodylin@localhost:5432/healthcare_costs',
)
engine = create_engine(DATABASE_URI)

# Function to generate sample healthcare data
def generate_sample_data(n_records=1000, seed=42):
    """Build a DataFrame of synthetic healthcare claim records.

    Args:
        n_records: Number of claim rows to generate.
        seed: Seed for the random number generator. The default (42) yields
            the same values as seeding NumPy's global generator with 42 and
            drawing from ``np.random`` in the same order.

    Returns:
        A DataFrame with ``n_records`` rows and the columns ``service_date``,
        ``patient_id``, ``provider_id``, ``diagnosis_code``,
        ``procedure_code``, ``total_cost``, ``service_type`` and ``location``.
    """
    # Use a private generator instead of np.random.seed(): same random stream,
    # but calling this function no longer resets NumPy's global random state
    # for whatever code runs afterwards.
    rng = np.random.RandomState(seed)

    # Every calendar day of 2023 is a candidate service date.
    dates = pd.date_range(start='2023-01-01', end='2023-12-31', freq='D')

    # NOTE: the draws below consume the random stream in order, so reordering
    # the entries changes the generated values.
    # randint's upper bound is exclusive: e.g. patient ids run P00001..P00499.
    data = {
        'service_date': rng.choice(dates, n_records),
        'patient_id': [f'P{i:05d}' for i in rng.randint(1, 500, n_records)],
        'provider_id': [f'DR{i:03d}' for i in rng.randint(1, 50, n_records)],
        'diagnosis_code': [f'D{i:03d}' for i in rng.randint(1, 20, n_records)],
        'procedure_code': [f'PROC{i:03d}' for i in rng.randint(1, 30, n_records)],
        # Log-normal draws are strictly positive and right-skewed.
        'total_cost': rng.lognormal(mean=5, sigma=1, size=n_records),
        'service_type': rng.choice(
            ['Outpatient', 'Inpatient', 'Emergency', 'Preventive'], n_records
        ),
        'location': rng.choice(
            ['North', 'South', 'East', 'West', 'Central'], n_records
        ),
    }

    return pd.DataFrame(data)

# Build the raw synthetic claims dataset (5,000 records); cleaning is a separate step
df = generate_sample_data(n_records=5000)

# Basic data cleaning
def clean_healthcare_data(df):
    """Return a cleaned copy of raw claims with derived reporting columns.

    The input frame is not modified. Steps applied:

    1. ``service_date`` is converted to datetime.
    2. ``total_cost`` is rounded to 2 decimal places.
    3. Rows whose rounded cost is not strictly positive are dropped (this
       also drops rows with a missing cost, since ``NaN > 0`` is False).
    4. ``month_year`` (``'YYYY-MM'`` string), ``weekday`` (day name) and
       ``cost_category`` (``'Low'``/``'Medium'``/``'High'`` terciles of
       ``total_cost``) are added.

    Args:
        df: DataFrame with at least ``service_date`` and ``total_cost``
            columns.

    Returns:
        A new DataFrame containing the surviving rows and the added columns.

    Raises:
        ValueError: Propagated from ``pd.qcut`` when the tercile bin edges of
            the remaining costs are not unique (e.g. too few distinct values).
    """
    cost_labels = ['Low', 'Medium', 'High']

    df_cleaned = df.copy()

    # Convert service_date to datetime
    df_cleaned['service_date'] = pd.to_datetime(df_cleaned['service_date'])

    # Round first, then filter: a positive sub-cent cost (e.g. 0.004) rounds
    # to 0.00 and must be removed together with the other zero-cost rows.
    df_cleaned['total_cost'] = df_cleaned['total_cost'].round(2)

    # .copy() makes the filtered frame independent, so the column assignments
    # below are not chained assignments on a slice.
    df_cleaned = df_cleaned.loc[df_cleaned['total_cost'] > 0].copy()

    # Add derived columns - using string format for month_year instead of Period
    df_cleaned['month_year'] = df_cleaned['service_date'].dt.strftime('%Y-%m')
    df_cleaned['weekday'] = df_cleaned['service_date'].dt.day_name()

    # Create cost categories
    if df_cleaned.empty:
        # No rows means no quantile edges to compute; keep the column with
        # the same ordered categories the non-empty path produces.
        df_cleaned['cost_category'] = pd.Categorical(
            [], categories=cost_labels, ordered=True
        )
    else:
        df_cleaned['cost_category'] = pd.qcut(
            df_cleaned['total_cost'], q=3, labels=cost_labels
        )

    return df_cleaned

# Run the cleaning step on the generated claims
df_cleaned = clean_healthcare_data(df)

# Write the cleaned claims to PostgreSQL, replacing any existing `claims` table
df_cleaned.to_sql(name='claims', con=engine, if_exists='replace', index=False)

# Histogram of claim costs (50 bins) as a quick look at the distribution
histogram_options = dict(
    x='total_cost',
    title='Distribution of Healthcare Costs',
    nbins=50,
)
fig = px.histogram(df_cleaned, **histogram_options)
fig.show()

# Summary statistics of the cost column
cost_summary = df_cleaned['total_cost'].describe()
print("\nBasic Statistics:")
print(cost_summary)

# Save processed data for Tableau
import os

output_path = '../data/processed/cleaned_claims.csv'

# to_csv does not create missing parent directories, so make sure the target
# folder exists before writing (no-op when it is already there).
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df_cleaned.to_csv(output_path, index=False)

# Verify the file was saved
if os.path.exists(output_path):
    print("\nCSV file successfully saved!")
    print("File location:", os.path.abspath(output_path))
else:
    print("\nError: CSV file not saved")


Basic Statistics:
count    5000.000000
mean      236.546556
std       289.884528
min         4.530000
25%        75.795000
50%       146.935000
75%       281.280000
max      4095.680000
Name: total_cost, dtype: float64

CSV file successfully saved!
File location: /Users/woodylin/Documents/Healthcare-Cost-Analysis-and-Forecasting/healthcare-cost-analysis/data/processed/cleaned_claims.csv
