In [7]:
import pandas as pd 
import numpy as np

#### Loading the dataset
---

In [8]:
csv_path = '../data/cehrt.csv'
df = pd.read_csv(csv_path)

  df = pd.read_csv(csv_path)


#### Initial Exploratory Data Analysis (EDA)
---

In [9]:
print("Shape:", df.shape)
print("Columns:", df.columns.tolist())
print("Data types:\n", df.dtypes)
print("Missing values:\n", df.isnull().sum())
print("First 5 rows:\n", df.head())

Shape: (33008, 16)
Columns: ['Facility.ID', 'Facility.Name', 'Address', 'City.Town', 'State', 'ZIP.Code', 'County.Parish', 'Telephone.Number', 'Meets.criteria.for.promoting.interoperability.of.EHRs', 'Start.Date', 'End.Date', 'CEHRT.ID', 'chpl_id', 'product_database_id', 'developer_name', 'product_name']
Data types:
 Facility.ID                                                int64
Facility.Name                                             object
Address                                                   object
City.Town                                                 object
State                                                     object
ZIP.Code                                                   int64
County.Parish                                             object
Telephone.Number                                          object
Meets.criteria.for.promoting.interoperability.of.EHRs     object
Start.Date                                                object
End.Date                       

#### Cleaning the data
---

In [10]:
#standardize the column names to lower case and replace spaces, dashes, slashes, and dots with underscores to avoid issues in merging and analysis
df.columns = df.columns.str.lower().str.replace(' ', '_').str.replace('-', '_')
df.columns = df.columns.str.replace('/', '_')
df.columns = df.columns.str.replace('.', '_')
print("Standardized column names:", df.columns.tolist())

Standardized column names: ['facility_id', 'facility_name', 'address', 'city_town', 'state', 'zip_code', 'county_parish', 'telephone_number', 'meets_criteria_for_promoting_interoperability_of_ehrs', 'start_date', 'end_date', 'cehrt_id', 'chpl_id', 'product_database_id', 'developer_name', 'product_name']


In [11]:
# drop columns that are not needed
columns_to_drop = ['facility_name', 'address', 'city_town', 'state', 'zip_code', 'county_parish', 'telephone_number', 'meets_criteria_for_promoting_interoperability_of_ehrs', 'start_date', 'end_date', 'chpl_id', 'product_name', 'product_database_id']
df.drop(columns=columns_to_drop, inplace=True, errors='ignore')
print("Remaining columns after dropping:", df.columns.tolist())

Remaining columns after dropping: ['facility_id', 'cehrt_id', 'developer_name']


In [12]:
#convert 'facility_id', 'cehrt_id', 'developer_name' to string to ensure consistency
df['facility_id'] = df['facility_id'].astype(str)
df['cehrt_id'] = df['cehrt_id'].astype(str)
df['developer_name'] = df['developer_name'].astype(str)
# verify conversion by checking for missing and unique values in each column
print("Missing values in 'facility_id':", df['facility_id'].isnull().sum())
print("Unique values in 'facility_id':", df['facility_id'].nunique())
print("Missing values in 'cehrt_id':", df['cehrt_id'].isnull().sum())
print("Unique values in 'cehrt_id':", df['cehrt_id'].nunique())
print("Missing values in 'developer_name':", df['developer_name'].isnull().sum())
print("Unique values in 'developer_name':", df['developer_name'].nunique())

Missing values in 'facility_id': 0
Unique values in 'facility_id': 4593
Missing values in 'cehrt_id': 0
Unique values in 'cehrt_id': 876
Missing values in 'developer_name': 0
Unique values in 'developer_name': 90


#### Checking for duplicates
---

In [13]:
facility_duplicates = df['facility_id'].duplicated().sum()
print(f"Total duplicate facility_id: {facility_duplicates}")

if facility_duplicates > 0:
    df = df.groupby('facility_id').first().reset_index()
    print("Merged duplicate facility_id rows. New shape:", df.shape)
    
# verifying merged duplicates
print("Duplicates in 'facility_id':", df['facility_id'].duplicated().sum())

Total duplicate facility_id: 28415
Merged duplicate facility_id rows. New shape: (4593, 3)
Duplicates in 'facility_id': 0


In [14]:
import os

# Create clean_data directory if it doesn't exist
clean_data_dir = '../data/clean_data'
os.makedirs(clean_data_dir, exist_ok=True)

# Save the cleaned DataFrame to a new CSV file
clean_csv_path = os.path.join(clean_data_dir, 'cleaned_cehrt.csv')
df.to_csv(clean_csv_path, index=False)