In [29]:
import numpy as np
import pandas as pd

from utilities import exploratory_data_analysis

# Import Data

In [30]:
# Read the raw CSV once; `df` is bound to the very same object (an alias, not a copy).
df = hospital_general_info_df = pd.read_csv('data/hospital_general_info.csv')

# Initial Exploration

In [31]:
# Overview of the raw frame using the project-local helper
# (imported together with the other dependencies in the imports cell).
exploratory_data_analysis(df)

  Facility ID                    Facility Name                     Address  \
0       10001  SOUTHEAST HEALTH MEDICAL CENTER      1108 ROSS CLARK CIRCLE   
1       10005         MARSHALL MEDICAL CENTERS  2505 U S HIGHWAY 431 NORTH   
2       10006     NORTH ALABAMA MEDICAL CENTER         1701 VETERANS DRIVE   
3       10007         MIZELL MEMORIAL HOSPITAL               702 N MAIN ST   
4       10008      CRENSHAW COMMUNITY HOSPITAL         101 HOSPITAL CIRCLE   

  City/Town State  ZIP Code County/Parish Telephone Number  \
0    DOTHAN    AL     36301       HOUSTON   (334) 793-8701   
1      BOAZ    AL     35957      MARSHALL   (256) 593-8310   
2  FLORENCE    AL     35630    LAUDERDALE   (256) 768-8400   
3       OPP    AL     36467     COVINGTON   (334) 493-3541   
4   LUVERNE    AL     36049      CRENSHAW   (334) 335-3374   

          Hospital Type                           Hospital Ownership  ...  \
0  Acute Care Hospitals  Government - Hospital District or Authority  ...   
1  A

# Cleaning the data

In [32]:
def clean_hospital_general_info(df: pd.DataFrame) -> pd.DataFrame:
    """
    Clean the hospital general info data and return the result as a new frame.

    The input frame is never modified; all work happens on a copy. Steps, in order:

    1. Standardize column names: lowercase, with spaces and hyphens replaced
       by underscores (other characters such as '/' are left as they are).
    2. Drop the telephone, birthing-friendly, footnote and measure-count
       columns listed in ``columns_to_drop``; names that are absent are ignored.
    3. Convert ``zip_code`` to strings left-padded with zeros to 5 characters.
    4. Coerce the overall rating and the three "measures better" columns to
       numeric; values that cannot be parsed as numbers become NaN.
    5. Replace every remaining NaN, in every column, with 0.

    Diagnostics (column lists, dtypes, null counts) are printed along the way.

    Note:
        Because of step 5, a 0 in ``hospital_overall_rating`` (or in any other
        column) can mean "value was missing", not a genuine zero.

    Args:
        df (pd.DataFrame): The raw hospital general info data. Left unmodified.

    Returns:
        pd.DataFrame: A new, cleaned DataFrame.

    """
    # Work on a copy so the caller's raw frame (and any alias of it) stays intact
    cleaned = df.copy()

    # Standardize the column names
    cleaned.columns = cleaned.columns.str.lower().str.replace(' ', '_').str.replace('-', '_')
    print("Standardized column names:", cleaned.columns.tolist())

    # Columns that are not needed downstream
    columns_to_drop = [
        'telephone_number',
        'meets_criteria_for_birthing_friendly_designation',
        'hospital_overall_rating_footnote',
        'mort_group_measure_count',
        'count_of_facility_mort_measures',
        'count_of_mort_measures_no_different',
        'count_of_mort_measures_worse',
        'mort_group_footnote',
        'safety_group_measure_count',
        'count_of_facility_safety_measures',
        'count_of_safety_measures_no_different',
        'count_of_safety_measures_worse',
        'safety_group_footnote',
        'readm_group_measure_count',
        'count_of_facility_readm_measures',
        'count_of_readm_measures_no_different',
        'count_of_readm_measures_worse',
        'readm_group_footnote',
        'pt_exp_group_measure_count',
        'count_of_facility_pt_exp_measures',
        'pt_exp_group_footnote',
        'te_group_measure_count',
        'count_of_facility_te_measures',
        'te_group_footnote'
    ]

    # errors='ignore' tolerates inputs where some of these columns are already gone
    cleaned = cleaned.drop(columns=columns_to_drop, errors='ignore')
    print("Remaining columns after dropping:", cleaned.columns.tolist())

    # Convert zip_code to string to preserve formatting like leading zeros.
    # NOTE(review): missing ZIP values get no special treatment here - confirm
    # the source data has none.
    if 'zip_code' in cleaned.columns:
        cleaned['zip_code'] = cleaned['zip_code'].astype(str).str.zfill(5)

    # Columns expected to hold numbers (standardized names)
    cols_to_convert = [
        'hospital_overall_rating',
        'count_of_mort_measures_better',
        'count_of_safety_measures_better',
        'count_of_readm_measures_better'
    ]

    for col in cols_to_convert:
        if col in cleaned.columns:
            # errors='coerce' turns non-numeric entries into NaN instead of raising
            cleaned[col] = pd.to_numeric(cleaned[col], errors='coerce')

    # Print the data types of each column
    print("Data types of each column:")
    print(cleaned.dtypes)

    # Print null values in each column
    print(cleaned.isnull().sum())

    # Replace NaN values with 0 (applies to every column, not only numeric ones)
    cleaned = cleaned.fillna(0)
    print("Null values after filling with 0:", cleaned.isnull().sum())
    return cleaned

# Calling the function to clean the data

In [33]:
# Rebind the name to the cleaned result; from this cell on,
# `hospital_general_info_df` refers to the frame returned by the cleaner.
hospital_general_info_df = clean_hospital_general_info(hospital_general_info_df)

Standardized column names: ['facility_id', 'facility_name', 'address', 'city/town', 'state', 'zip_code', 'county/parish', 'telephone_number', 'hospital_type', 'hospital_ownership', 'emergency_services', 'meets_criteria_for_birthing_friendly_designation', 'hospital_overall_rating', 'hospital_overall_rating_footnote', 'mort_group_measure_count', 'count_of_facility_mort_measures', 'count_of_mort_measures_better', 'count_of_mort_measures_no_different', 'count_of_mort_measures_worse', 'mort_group_footnote', 'safety_group_measure_count', 'count_of_facility_safety_measures', 'count_of_safety_measures_better', 'count_of_safety_measures_no_different', 'count_of_safety_measures_worse', 'safety_group_footnote', 'readm_group_measure_count', 'count_of_facility_readm_measures', 'count_of_readm_measures_better', 'count_of_readm_measures_no_different', 'count_of_readm_measures_worse', 'readm_group_footnote', 'pt_exp_group_measure_count', 'count_of_facility_pt_exp_measures', 'pt_exp_group_footnote', 't

In [34]:
# Preview the first rows of the cleaned frame to sanity-check the result
hospital_general_info_df.head()

Unnamed: 0,facility_id,facility_name,address,city/town,state,zip_code,county/parish,hospital_type,hospital_ownership,emergency_services,hospital_overall_rating,count_of_mort_measures_better,count_of_safety_measures_better,count_of_readm_measures_better
0,10001,SOUTHEAST HEALTH MEDICAL CENTER,1108 ROSS CLARK CIRCLE,DOTHAN,AL,36301,HOUSTON,Acute Care Hospitals,Government - Hospital District or Authority,Yes,3.0,1.0,2.0,1.0
1,10005,MARSHALL MEDICAL CENTERS,2505 U S HIGHWAY 431 NORTH,BOAZ,AL,35957,MARSHALL,Acute Care Hospitals,Government - Hospital District or Authority,Yes,2.0,0.0,0.0,0.0
2,10006,NORTH ALABAMA MEDICAL CENTER,1701 VETERANS DRIVE,FLORENCE,AL,35630,LAUDERDALE,Acute Care Hospitals,Proprietary,Yes,1.0,0.0,3.0,0.0
3,10007,MIZELL MEMORIAL HOSPITAL,702 N MAIN ST,OPP,AL,36467,COVINGTON,Acute Care Hospitals,Voluntary non-profit - Private,Yes,1.0,0.0,0.0,0.0
4,10008,CRENSHAW COMMUNITY HOSPITAL,101 HOSPITAL CIRCLE,LUVERNE,AL,36049,CRENSHAW,Acute Care Hospitals,Proprietary,Yes,0.0,0.0,0.0,0.0
