In [None]:
import pandas as pd
import numpy as np
import sys

# Add the parent directory to the import search path so the project's `utils`
# package can be imported from this notebook.
sys.path.append('..')
# NOTE(review): the wildcard import hides which helpers this notebook depends on;
# `basic_eda` is the only one visibly used, and it is imported by name further down.
from utils.utils import *  # Import all functions from utils.utils

#### Loading the dataset
---

In [None]:
# Path to the raw hospital interoperability CSV, relative to this notebook.
csv_path = '../data/hospital_interoperability_data.csv'

# Load the raw file with pandas' default type inference.
# NOTE(review): the previewed 'Facility ID' values print as bare integers
# (e.g. 10001). If the source file stores zero-padded IDs, they are lost here;
# passing dtype={'Facility ID': str} would keep them — confirm against the raw CSV.
df = pd.read_csv(csv_path)


#### Initial Exploratory Data Analysis (EDA)
---

In [None]:
from utils.utils import basic_eda
# Run the project's `basic_eda` helper on the freshly loaded frame; its printed
# summary (beginning with the first five rows) is shown as the cell output.
basic_eda(df)

First 5 rows:
   Facility ID                    Facility Name                     Address  \
0        10001  SOUTHEAST HEALTH MEDICAL CENTER      1108 ROSS CLARK CIRCLE   
1        10005         MARSHALL MEDICAL CENTERS  2505 U S HIGHWAY 431 NORTH   
2        10006     NORTH ALABAMA MEDICAL CENTER         1701 VETERANS DRIVE   
3        10007         MIZELL MEMORIAL HOSPITAL               702 N MAIN ST   
4        10008      CRENSHAW COMMUNITY HOSPITAL         101 HOSPITAL CIRCLE   

  City/Town State  ZIP Code County/Parish Telephone Number         CEHRT ID  \
0    DOTHAN    AL     36301       HOUSTON   (334) 793-8701  0015CAN28DKT47C   
1      BOAZ    AL     35957      MARSHALL   (256) 593-8310  0015C0HAX4ESQ0D   
2  FLORENCE    AL     35630    LAUDERDALE   (256) 768-8400  0015CW76TRC3SVN   
3       OPP    AL     36467     COVINGTON   (334) 493-3541  0015CFG3Q10HY2V   
4   LUVERNE    AL     36049      CRENSHAW   (334) 335-3374  0015CFG3Q10HY2V   

  Meets criteria for promoting inter

#### Cleaning the data
---

In [None]:
# Normalise the headers to snake_case: lower-case every name, then swap each
# space, hyphen and slash for an underscore in a single pass.
snake_case_columns = df.columns.str.lower().str.replace(r'[ \-/]', '_', regex=True)
df.columns = snake_case_columns
print("Standardized column names:", df.columns.tolist())

Standardized column names: ['facility_id', 'facility_name', 'address', 'city_town', 'state', 'zip_code', 'county_parish', 'telephone_number', 'cehrt_id', 'meets_criteria_for_promoting_interoperability_of_ehrs', 'start_date', 'end_date']


In [None]:
# Name, contact and location fields are not needed for this analysis.
columns_to_drop = [
    'facility_name',
    'telephone_number',
    'county_parish',
    'address',
    'city_town',
    'state',
    'zip_code',
]
# errors='ignore' keeps the cell safe to re-run once the columns are already gone.
df = df.drop(columns=columns_to_drop, errors='ignore')
print("Remaining columns after dropping:", df.columns.tolist())

Remaining columns after dropping: ['facility_id', 'cehrt_id', 'meets_criteria_for_promoting_interoperability_of_ehrs', 'start_date', 'end_date']


In [None]:
# Cast 'facility_id' to string so it has the same dtype as the key it will be
# merged against in hospital_general_info.
# NOTE(review): the IDs print without leading zeros (e.g. 10001). If the raw
# file stores zero-padded IDs they were dropped when the CSV was parsed —
# confirm the key format used by hospital_general_info before merging.
df['facility_id'] = df['facility_id'].astype(str)

# Turn blank entries in the criteria flag into NaN. The anchored regex matches
# empty strings AND whitespace-only strings (e.g. ' '), which a plain
# replace('', np.nan) would leave behind as non-missing values. This does not
# change the column's dtype; real values such as 'Y' are untouched.
interop_col = 'meets_criteria_for_promoting_interoperability_of_ehrs'
df[interop_col] = df[interop_col].replace(r'^\s*$', np.nan, regex=True)

# Preview the result and report how many values are missing per column.
print(df.head(20))
print("NaN values in each column:")
print(df.isnull().sum())

   facility_id         cehrt_id  \
0        10001  0015CAN28DKT47C   
1        10005  0015C0HAX4ESQ0D   
2        10006  0015CW76TRC3SVN   
3        10007  0015CFG3Q10HY2V   
4        10008  0015CFG3Q10HY2V   
5        10011  0015C5LFV335W0K   
6        10012  0015C8CHE73ABS5   
7        10016  0015CEN2HB2EXA1   
8        10018  0015CN0ZLY04L17   
9        10019              NaN   
10       10021  0015CFG3Q10HY2V   
11       10022  0015CJ4T57TWR1R   
12       10023  0015CEZ6FY4WR6V   
13       10024  0015CG91R3XQ5T3   
14       10029  0015C1HRS47CWV4   
15       10033  0015CA3TZV3LRM4   
16       10034  0015CFG3Q10HY2V   
17       10035  0015CFG3Q10HY2V   
18       10036  0015CC4NC7PMU4W   
19       10039  0015CV1DWP8EW15   

   meets_criteria_for_promoting_interoperability_of_ehrs start_date  \
0                                                   Y      1/1/2023   
1                                                   Y      1/1/2023   
2                                                  

### Handling Missing Values
---
##### There are many blank values in the 'cehrt_id' and 'meets_criteria_for_promoting_interoperability_of_ehrs' columns. The Hospital General Information dataset indicated that several hospitals do not participate in the EHR Incentive Program. Since I am merging on the 'facility_id' column of the Hospital General Information dataset, I will address these blank values after the merge, as many of the affected rows will likely be removed by it.

#### Checking for Duplicates
---

In [None]:
# Report the frame's dimensions, then count rows whose 'facility_id' repeats an
# earlier row (the first occurrence of each ID is not counted).
print("Shape of the dataset:", df.shape)
repeat_mask = df.duplicated(subset=['facility_id'])
facility_duplicates = repeat_mask.sum()
print(f"Total duplicate facility_id: {facility_duplicates}")

Shape of the dataset: (4593, 5)
Total duplicate facility_id: 0


In [None]:
# Build a separate frame keyed by 'facility_id' for label-based lookups.
# `df` itself is left unchanged; the later SQLite and CSV export cells
# still write `df`, not `df_indexed`.
df_indexed = df.set_index(keys='facility_id')
indexed_preview = df_indexed.head()
print("Data indexed by 'facility_id':\n", indexed_preview)

Data indexed by 'facility_id':
                     cehrt_id  \
facility_id                    
10001        0015CAN28DKT47C   
10005        0015C0HAX4ESQ0D   
10006        0015CW76TRC3SVN   
10007        0015CFG3Q10HY2V   
10008        0015CFG3Q10HY2V   

            meets_criteria_for_promoting_interoperability_of_ehrs start_date  \
facility_id                                                                    
10001                                                        Y      1/1/2023   
10005                                                        Y      1/1/2023   
10006                                                        Y      1/1/2023   
10007                                                        Y      1/1/2023   
10008                                                        Y      1/1/2023   

               end_date  
facility_id              
10001        12/31/2023  
10005        12/31/2023  
10006        12/31/2023  
10007        12/31/2023  
10008        12/31/2023  


In [None]:
# Show the cleaned frame as the cell output (pandas elides the middle rows).
df

Unnamed: 0,facility_id,cehrt_id,meets_criteria_for_promoting_interoperability_of_ehrs,start_date,end_date
0,10001,0015CAN28DKT47C,Y,1/1/2023,12/31/2023
1,10005,0015C0HAX4ESQ0D,Y,1/1/2023,12/31/2023
2,10006,0015CW76TRC3SVN,Y,1/1/2023,12/31/2023
3,10007,0015CFG3Q10HY2V,Y,1/1/2023,12/31/2023
4,10008,0015CFG3Q10HY2V,Y,1/1/2023,12/31/2023
...,...,...,...,...,...
4588,670322,,,1/1/2023,12/31/2023
4589,670326,,,1/1/2023,12/31/2023
4590,670327,,,1/1/2023,12/31/2023
4591,671300,0015C2XBM59VCV5,,1/1/2023,12/31/2023


#### Converting to Sqlite Database & Creating Table
---

In [None]:
import sqlite3
from contextlib import closing

# Open the SQLite database file (sqlite3 creates it if it does not exist yet).
# `closing` guarantees the connection is released even when the write raises,
# which a trailing conn.close() call would not.
with closing(sqlite3.connect("../data/ehr_data.sqlite")) as conn:
    # Write the DataFrame `df` to the table "interoperability_data".
    df.to_sql(
        name="interoperability_data",  # table name
        con=conn,                      # connection to the SQLite database
        if_exists="replace",           # drop the table if it exists and create a new one
        index=False,                   # do not write DataFrame index as a column
    )

print("Data successfully written to SQLite database 'ehr_data.sqlite' in table 'interoperability_data'.")

Data successfully written to SQLite database 'ehr_data.sqlite' in table 'interoperability_data'.


#### Saving the cleaned dataset to CSV
---

In [None]:
import os

# Output location for the cleaned dataset.
clean_data_dir = '../data/clean_data'
clean_csv_path = os.path.join(clean_data_dir, 'cleaned_hospital_interoperability.csv')

# Make sure the target folder exists, then export without the row index.
os.makedirs(clean_data_dir, exist_ok=True)
df.to_csv(clean_csv_path, index=False)

#