In [15]:
import pandas as pd
import numpy as np

# Import Data

In [16]:
general_df = pd.read_csv('data/general.csv')

df = general_df

# Initial Exploration

In [17]:
print (df.head())
print (df.columns)
print (df.shape)
print (df.info())
print (df.describe())
print (df.isnull().sum())


  Facility ID                    Facility Name                     Address  \
0       10001  SOUTHEAST HEALTH MEDICAL CENTER      1108 ROSS CLARK CIRCLE   
1       10005         MARSHALL MEDICAL CENTERS  2505 U S HIGHWAY 431 NORTH   
2       10006     NORTH ALABAMA MEDICAL CENTER         1701 VETERANS DRIVE   
3       10007         MIZELL MEMORIAL HOSPITAL               702 N MAIN ST   
4       10008      CRENSHAW COMMUNITY HOSPITAL         101 HOSPITAL CIRCLE   

  City/Town State  ZIP Code County/Parish Telephone Number  \
0    DOTHAN    AL     36301       HOUSTON   (334) 793-8701   
1      BOAZ    AL     35957      MARSHALL   (256) 593-8310   
2  FLORENCE    AL     35630    LAUDERDALE   (256) 768-8400   
3       OPP    AL     36467     COVINGTON   (334) 493-3541   
4   LUVERNE    AL     36049      CRENSHAW   (334) 335-3374   

          Hospital Type                           Hospital Ownership  ...  \
0  Acute Care Hospitals  Government - Hospital District or Authority  ...   
1  A

# Cleaning the data

In [18]:
def clean_general(df):
    """
    Cleans the hospital general info data by standardizing column names,
    dropping unnecessary columns, converting data types, and handling missing values.
    Args:
        df (pd.DataFrame): The hospital general info data.
    Returns:
        pd.DataFrame: Cleaned hospital_general_info data.
    """
    #standardize the column names
    df.columns = df.columns.str.lower().str.replace(' ', '_').str.replace('-', '_')

    #drop columns that are not needed
    columns_to_drop = [
        'telephone_number', 
        'meets_criteria_for_birthing_friendly_designation',
        'hospital_overall_rating_footnote',
        'mort_group_measure_count',
        'count_of_facility_mort_measures',
        'count_of_mort_measures_no_different',
        'count_of_mort_measures_worse',
        'mort_group_footnote',
        'safety_group_measure_count',
        'count_of_facility_safety_measures',
        'count_of_safety_measures_no_different',
        'count_of_safety_measures_worse',
        'safety_group_footnote',
        'readm_group_measure_count',
        'count_of_facility_readm_measures',
        'count_of_readm_measures_no_different',
        'count_of_readm_measures_worse',
        'readm_group_footnote',
        'pt_exp_group_measure_count',
        'count_of_facility_pt_exp_measures',
        'pt_exp_group_footnote',
        'te_group_measure_count',
        'count_of_facility_te_measures',
        'te_group_footnote', 
         'city/town', 
         'county/parish',
         'hospital_type',
         'emergency_services',
         
    ]
    df.drop(columns=columns_to_drop, inplace=True, errors='ignore')

    #convert zip_code to string (object) to preserve formatting like leading zeros
    if 'zip_code' in df.columns:
        df['zip_code'] = df['zip_code'].astype(str).str.zfill(5)

    #use the correct column names as they appear in the DataFrame (standardized)
    cols_to_convert = [
        'hospital_overall_rating',
        'count_of_mort_measures_better',
        'count_of_safety_measures_better',
        'count_of_readm_measures_better'
    ]
    
    for col in cols_to_convert:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')

    #replace NaN values with 0 for hospital_overall_rating, 'count_of_mort_measures_better',
    # 'count_of_safety_measures_better', and 'count_of_readm_measures_better'
    df['hospital_overall_rating'].fillna(0, inplace=True)
    df['count_of_mort_measures_better'].fillna(0, inplace=True)
    df['count_of_safety_measures_better'].fillna(0, inplace=True)
    df['count_of_readm_measures_better'].fillna(0, inplace=True)
    return df

In [19]:
#print clean_general(df).head()
cleaned_df = clean_general(df)
print(cleaned_df.head())



  facility_id                    facility_name                     address  \
0       10001  SOUTHEAST HEALTH MEDICAL CENTER      1108 ROSS CLARK CIRCLE   
1       10005         MARSHALL MEDICAL CENTERS  2505 U S HIGHWAY 431 NORTH   
2       10006     NORTH ALABAMA MEDICAL CENTER         1701 VETERANS DRIVE   
3       10007         MIZELL MEMORIAL HOSPITAL               702 N MAIN ST   
4       10008      CRENSHAW COMMUNITY HOSPITAL         101 HOSPITAL CIRCLE   

  state zip_code                           hospital_ownership  \
0    AL    36301  Government - Hospital District or Authority   
1    AL    35957  Government - Hospital District or Authority   
2    AL    35630                                  Proprietary   
3    AL    36467               Voluntary non-profit - Private   
4    AL    36049                                  Proprietary   

   hospital_overall_rating  count_of_mort_measures_better  \
0                      3.0                            1.0   
1                 

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['hospital_overall_rating'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['count_of_mort_measures_better'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which w

In [20]:
# Check for duplicates
duplicates = df.duplicated()
if duplicates.any():
    print("Duplicates found:")
    print(df[duplicates])
else:
    print("No duplicates found.")

No duplicates found.


In [21]:
# save the cleaned DataFrame to a new CSV file
cleaned_df.to_csv('data/general_cleaned.csv', index=False)