In [1]:
import pandas as pd
import numpy as np

# Import Data

In [2]:
hospital_interoperability__data_df = pd.read_csv('data/hospital_interoperability_data.csv')

df = hospital_interoperability__data_df

# Initial Exploration

In [3]:
from utilities import exploratory_data_analysis
exploratory_data_analysis(df)

   Facility ID                    Facility Name                     Address  \
0        10001  SOUTHEAST HEALTH MEDICAL CENTER      1108 ROSS CLARK CIRCLE   
1        10005         MARSHALL MEDICAL CENTERS  2505 U S HIGHWAY 431 NORTH   
2        10006     NORTH ALABAMA MEDICAL CENTER         1701 VETERANS DRIVE   
3        10007         MIZELL MEMORIAL HOSPITAL               702 N MAIN ST   
4        10008      CRENSHAW COMMUNITY HOSPITAL         101 HOSPITAL CIRCLE   

  City/Town State  ZIP Code County/Parish Telephone Number         CEHRT ID  \
0    DOTHAN    AL     36301       HOUSTON   (334) 793-8701  0015CAN28DKT47C   
1      BOAZ    AL     35957      MARSHALL   (256) 593-8310  0015C0HAX4ESQ0D   
2  FLORENCE    AL     35630    LAUDERDALE   (256) 768-8400  0015CW76TRC3SVN   
3       OPP    AL     36467     COVINGTON   (334) 493-3541  0015CFG3Q10HY2V   
4   LUVERNE    AL     36049      CRENSHAW   (334) 335-3374  0015CFG3Q10HY2V   

  Meets criteria for promoting interoperability of

# Cleaning the data

In [4]:
def clean_hospital_interoperability(df: pd.DataFrame) -> pd.DataFrame:
    """
    Clean the hospital_interoperability DataFrame by removing unwanted columns and filtering out unknown races.

    Args:
        df (pd.DataFrame): The hospital_interoperability.

    Returns:
        pd.DataFrame: Cleaned hospital_interoperability data.
    """
    #Standardize the column names
    df.columns = df.columns.str.lower().str.replace(' ', '_').str.replace('-', '_')
    #Print column names
    print("Standardized column names:", df.columns.tolist())

    # Convert ZIP Code to string (object) to preserve formatting like leading zeros
    df['zip_code'] = df['zip_code'].astype(str).str.zfill(5)
    # print zip code column dtype
    print("zip_code column dtype:", df['zip_code'].dtype)

    # Convert 'meets_criteria_for_promoting_interoperability_of_ehrs' to boolean
    df['meets_criteria_for_promoting_interoperability_of_ehrs'] = df['meets_criteria_for_promoting_interoperability_of_ehrs'].map({'Y': True, 'N': False})
    # Print the meets_criteria_for_promoting_interoperability column dtype
    print("meets_criteria_for_promoting_interoperability_of_ehrs column dtype:", df['meets_criteria_for_promoting_interoperability_of_ehrs'].dtype)

    # Convert start_date and end_date to datetime
    df['start_date'] = pd.to_datetime(df['start_date'], errors='coerce')
    df['end_date'] = pd.to_datetime(df['end_date'], errors='coerce')
    # print the start_date and end_date column dtypes
    print("start_date column dtype:", df['start_date'].dtype)
    print("end_date column dtype:", df['end_date'].dtype)

    # Convert facility_id to string
    df['facility_id'] = df['facility_id'].astype(str)
    
    # Print the facility_id column dtype
    print("facility_id column dtype:", df['facility_id'].dtype)

    # Drop columns that are not needed
    columns_to_drop = ['telephone_number']

    # Print column names after dropping
    df.drop(columns=columns_to_drop, inplace=True, errors='ignore')
    print(df.columns.tolist())

    return df

# Calling in the function to clean data

In [5]:
hospital_interoperability_df = clean_hospital_interoperability(hospital_interoperability__data_df)

Standardized column names: ['facility_id', 'facility_name', 'address', 'city/town', 'state', 'zip_code', 'county/parish', 'telephone_number', 'cehrt_id', 'meets_criteria_for_promoting_interoperability_of_ehrs', 'start_date', 'end_date']
zip_code column dtype: object
meets_criteria_for_promoting_interoperability_of_ehrs column dtype: object
start_date column dtype: datetime64[ns]
end_date column dtype: datetime64[ns]
facility_id column dtype: object
['facility_id', 'facility_name', 'address', 'city/town', 'state', 'zip_code', 'county/parish', 'cehrt_id', 'meets_criteria_for_promoting_interoperability_of_ehrs', 'start_date', 'end_date']


In [6]:
hospital_interoperability_df.head()  

Unnamed: 0,facility_id,facility_name,address,city/town,state,zip_code,county/parish,cehrt_id,meets_criteria_for_promoting_interoperability_of_ehrs,start_date,end_date
0,10001,SOUTHEAST HEALTH MEDICAL CENTER,1108 ROSS CLARK CIRCLE,DOTHAN,AL,36301,HOUSTON,0015CAN28DKT47C,True,2023-01-01,2023-12-31
1,10005,MARSHALL MEDICAL CENTERS,2505 U S HIGHWAY 431 NORTH,BOAZ,AL,35957,MARSHALL,0015C0HAX4ESQ0D,True,2023-01-01,2023-12-31
2,10006,NORTH ALABAMA MEDICAL CENTER,1701 VETERANS DRIVE,FLORENCE,AL,35630,LAUDERDALE,0015CW76TRC3SVN,True,2023-01-01,2023-12-31
3,10007,MIZELL MEMORIAL HOSPITAL,702 N MAIN ST,OPP,AL,36467,COVINGTON,0015CFG3Q10HY2V,True,2023-01-01,2023-12-31
4,10008,CRENSHAW COMMUNITY HOSPITAL,101 HOSPITAL CIRCLE,LUVERNE,AL,36049,CRENSHAW,0015CFG3Q10HY2V,True,2023-01-01,2023-12-31
