In [15]:
import pandas as pd
import numpy as np

# Import Data

In [16]:
interoperability_df = pd.read_csv('data/interoperability.csv')

df = interoperability_df

# Initial Exploration

In [17]:
print (df.head())
print (df.columns)
print (df.shape)
print (df.info())
print (df.describe())
print (df.isnull().sum())

   Facility ID                    Facility Name                     Address  \
0        10001  SOUTHEAST HEALTH MEDICAL CENTER      1108 ROSS CLARK CIRCLE   
1        10005         MARSHALL MEDICAL CENTERS  2505 U S HIGHWAY 431 NORTH   
2        10006     NORTH ALABAMA MEDICAL CENTER         1701 VETERANS DRIVE   
3        10007         MIZELL MEMORIAL HOSPITAL               702 N MAIN ST   
4        10008      CRENSHAW COMMUNITY HOSPITAL         101 HOSPITAL CIRCLE   

  City/Town State  ZIP Code County/Parish Telephone Number         CEHRT ID  \
0    DOTHAN    AL     36301       HOUSTON   (334) 793-8701  0015CAN28DKT47C   
1      BOAZ    AL     35957      MARSHALL   (256) 593-8310  0015C0HAX4ESQ0D   
2  FLORENCE    AL     35630    LAUDERDALE   (256) 768-8400  0015CW76TRC3SVN   
3       OPP    AL     36467     COVINGTON   (334) 493-3541  0015CFG3Q10HY2V   
4   LUVERNE    AL     36049      CRENSHAW   (334) 335-3374  0015CFG3Q10HY2V   

  Meets criteria for promoting interoperability of

# Cleaning the data

In [18]:
# Standardize column names
df.columns = df.columns.str.lower().str.replace(' ', '_').str.replace('-', '_')

In [19]:
def clean_interoperability(df):
    """
    Cleans the hospital interoperability data by standardizing column names,
    dropping unnecessary columns, converting data types, and handling missing values.
    """
 
    # Drop unnecessary columns
    cols_to_drop = ['address', 'city/town', 'state', 'zip_code', 'county/parish', 'telephone_number', 'start_date', 'end_date']

    df.drop(columns=cols_to_drop, inplace=True, errors='ignore')

        
    #fill NaN or missing values in 'meets_criteria_for_promoting_interoperability_of_ehrs' with No
    if 'meets_criteria_for_promoting_interoperability_of_ehrs' in df.columns:
        df['meets_criteria_for_promoting_interoperability_of_ehrs'].fillna('No', inplace=True)
    # Convert 'meets_criteria_for_promoting_interoperability_of_ehrs' to categorical
    if 'meets_criteria_for_promoting_interoperability_of_ehrs' in df.columns:
        df['meets_criteria_for_promoting_interoperability_of_ehrs'] = df['meets_criteria_for_promoting_interoperability_of_ehrs'].astype('category')
        
   
    #convert zip_code to string
    if 'zip_code' in df.columns:
        df['zip_code'] = df['zip_code'].astype(str)
        
    return df
    

In [20]:
print(clean_interoperability(df).head())
cleaned_df = clean_interoperability(df)

   facility_id                    facility_name         cehrt_id  \
0        10001  SOUTHEAST HEALTH MEDICAL CENTER  0015CAN28DKT47C   
1        10005         MARSHALL MEDICAL CENTERS  0015C0HAX4ESQ0D   
2        10006     NORTH ALABAMA MEDICAL CENTER  0015CW76TRC3SVN   
3        10007         MIZELL MEMORIAL HOSPITAL  0015CFG3Q10HY2V   
4        10008      CRENSHAW COMMUNITY HOSPITAL  0015CFG3Q10HY2V   

  meets_criteria_for_promoting_interoperability_of_ehrs  
0                                                  Y     
1                                                  Y     
2                                                  Y     
3                                                  Y     
4                                                  Y     


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['meets_criteria_for_promoting_interoperability_of_ehrs'].fillna('No', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['meets_criteria_for_promoting_interoperability_of_ehrs'].fillna('No', inplace=True)


In [21]:
#save the cleaned DataFrame to a new CSV file
cleaned_df.to_csv('data/interoperability_cleaned.csv', index=False)
