In [5]:
import os
import warnings

import numpy as np
import pandas as pd
import plotly.express as px
from sqlalchemy import create_engine

warnings.filterwarnings('ignore')

# Database connection
# The URI may be overridden with the DATABASE_URI environment variable so the
# notebook can run against a different server or account without editing code.
# The fallback is the original local PostgreSQL database.
DATABASE_URI = os.environ.get(
    'DATABASE_URI',
    'postgresql://woodylin@localhost:5432/healthcare_costs',
)
engine = create_engine(DATABASE_URI)

# Text tokens (compared case-insensitively) that mean "flag is set".
_TRUE_FLAG_TOKENS = frozenset({'true', 't', 'yes', 'y'})


def _outlier_flag_to_bool(flags):
    """Return *flags* as a plain ``bool`` Series; missing values become False."""
    if pd.api.types.is_bool_dtype(flags):
        return flags.fillna(False).astype(bool)

    # A bare ``astype(bool)`` would turn NaN and strings such as "0" or "N"
    # into True, so parse the value instead of relying on truthiness.
    as_number = pd.to_numeric(flags, errors='coerce')
    is_set = as_number.notna() & as_number.ne(0)
    as_text = flags.astype(str).str.strip().str.lower()
    return (is_set | as_text.isin(_TRUE_FLAG_TOKENS)).astype(bool)


def _safe_ratio(numerator, denominator):
    """Divide two Series, yielding NaN (not inf) where the denominator is 0."""
    return numerator / denominator.where(denominator != 0)


def clean_medicare_data(df, years=('2018', '2019', '2020', '2021', '2022')):
    """
    Clean and process Medicare Part D spending data.

    The input frame is not modified; all work happens on a copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw spending table. For every entry of *years* it must contain the
        columns ``Tot_Spndng_<year>``, ``Tot_Clms_<year>`` and
        ``Tot_Benes_<year>``.
    years : iterable of str, optional
        Year suffixes for which derived metrics are added. Defaults to
        2018 through 2022.

    Returns
    -------
    pandas.DataFrame
        Copy of *df* in which

        * spending / unit / claim / beneficiary columns are numeric, with
          unparseable entries set to NaN,
        * ``Outlier_Flag_*`` columns are ``bool`` (missing flags -> False),
        * ``Cost_Per_Claim_<year>`` and ``Cost_Per_Beneficiary_<year>`` are
          added for each year (NaN where the count is zero or missing).

    Raises
    ------
    KeyError
        If a column required for one of *years* is absent.
    """
    df_cleaned = df.copy()

    # Convert numeric columns (matched by substring, as in the raw headers)
    numeric_markers = (
        'Tot_Spndng_', 'Tot_Dsg_Unts_', 'Tot_Clms_', 'Tot_Benes_',
        'Avg_Spnd_Per_Dsg_Unt_Wghtd_', 'Avg_Spnd_Per_Clm_', 'Avg_Spnd_Per_Bene_',
    )
    numeric_columns = [
        col for col in df.columns
        if any(marker in col for marker in numeric_markers)
    ]
    for col in numeric_columns:
        df_cleaned[col] = pd.to_numeric(df_cleaned[col], errors='coerce')

    # Convert outlier flags to boolean
    outlier_columns = [col for col in df.columns if 'Outlier_Flag_' in col]
    for col in outlier_columns:
        df_cleaned[col] = _outlier_flag_to_bool(df_cleaned[col])

    # Add derived metrics
    for year in years:
        spending = df_cleaned[f'Tot_Spndng_{year}']

        # Calculate cost per claim
        df_cleaned[f'Cost_Per_Claim_{year}'] = _safe_ratio(
            spending, df_cleaned[f'Tot_Clms_{year}']
        )

        # Calculate cost per beneficiary
        df_cleaned[f'Cost_Per_Beneficiary_{year}'] = _safe_ratio(
            spending, df_cleaned[f'Tot_Benes_{year}']
        )

    return df_cleaned

# Read the data
# Loading and cleaning are prerequisites for everything else, so they share one
# guarded stage. The two exports are independent of each other and are guarded
# separately: an unreachable PostgreSQL server must not prevent the Tableau CSV
# from being written, and each failure message names the stage that failed.
try:
    df = pd.read_csv('../data/processed/DSD_PTD_RY24_P04_V10_DY22_BGM.csv')
    print("Data loaded successfully!")
    print(f"Number of records: {len(df)}")
    print("\nColumns in dataset:")
    print(df.columns.tolist())

    # Clean the data
    df_cleaned = clean_medicare_data(df)
except Exception as e:
    # Notebook-cell boundary: report the problem instead of raising.
    print(f"Error: {e}")
else:
    # Save to PostgreSQL (best effort)
    try:
        df_cleaned.to_sql('medicare_spending', engine, if_exists='replace', index=False)
    except Exception as e:
        print(f"Error saving to PostgreSQL: {e}")
    else:
        print("\nData saved to PostgreSQL successfully!")

    # Save cleaned CSV for Tableau (best effort)
    try:
        df_cleaned.to_csv('../data/processed/cleaned_medicare_spending.csv', index=False)
    except Exception as e:
        print(f"Error saving cleaned CSV: {e}")
    else:
        print("\nCleaned CSV saved for Tableau!")

    # Display basic statistics
    print("\nBasic statistics for 2022 spending:")
    print(df_cleaned['Tot_Spndng_2022'].describe())

Error: [Errno 2] No such file or directory: '../data/processed/DSD_PTD_RY24_P04_V10_DY22_BGM.csv'
