## Python Data Cleaning

### Load The Dataset

In [29]:
import pandas as pd
import numpy as np

In [3]:
# Location of the raw hospital-operations extract. Keeping it in one named
# constant means this is the only line to edit when running on another machine.
RAW_DATA_PATH = r"F:\Lerning\Case Studies\Healthcare\0_Planning\Hospital_Operations.csv"
df = pd.read_csv(RAW_DATA_PATH)

In [4]:
# Preview the first rows to eyeball the columns and spot obvious quality
# issues (mixed casing, typos, missing values) before cleaning.
df.head()

Unnamed: 0,Patient_ID,Admission_Date,Discharge_Date,Department,Doctor_ID,Doctor_Name,Diagnosis,Treatment_Type,Treatment_Cost,Insurance_Type,Satisfaction_Score,Readmission_30Days,Bed_ID,Region,Wait_Time_Minutes,Doctor_Experience_Years,Age,Gender,Total_Revenue
0,P1000,2024-08-03,2024-08-07,Cardiology,D053,Dr. Yara Hamed,Migraine,Inpatient,1559.81,self pay,7.0,y,B44,North,79.0,6,2,M,2214.44
1,P1001,2024-04-30,2024-04-28,Cardiology,D062,Dr. Omar Saeed,Diabetes,OUTPATIENT,2193.57,Public,4.0,y,B03,West,174.0,5,51,male,2119.59
2,P1002,2024-11-22,2024-11-26,ORTHO,D104,Dr. Omar Saeed,Fracture,inpatient,1519.8,privte,8.0,Yes,B06,South,176.0,22,4,M,2304.03
3,P1003,2024-02-14,2024-02-23,pediatric,D026,Dr Rana Elbaz,Migraine,Emergency,2590.77,self pay,3.0,No,B45,North,,9,29,F,3142.18
4,P1004,2024-04-10,2024-04-22,Orthopedics,D041,dr. omar said,Migraine,Emergency,499.46,self pay,1.0,y,B09,North,140.0,8,12,male,589.42


Duplicates

In [6]:
# Count rows that exactly repeat an earlier row across every column.
df.duplicated().sum()

0

Standardize text

In [24]:
# Trim surrounding whitespace and title-case each free-text column so that
# variants differing only in case or padding collapse to a single spelling.
for text_col in ('Department', 'Doctor_Name', 'Insurance_Type'):
    df[text_col] = df[text_col].str.strip().str.title()

Typos

In [25]:

# Known misspellings / variants of the insurance label mapped to the
# canonical spelling. Values not listed here are left untouched.
insurance_fixes = {
    'Privte': 'Private',
    'Self Pay': 'Self-Pay',
    'Public ': 'Public',
    'Publick': 'Public',
}
df['Insurance_Type'] = df['Insurance_Type'].replace(insurance_fixes)


Standardize readmission column

In [26]:
# Normalise the readmission flag to 'Y' / 'N'.
#
# Casting to str is kept so non-string entries can still be lower-cased, but
# it also turns missing values into the literal text 'nan', which would no
# longer count as missing in the later fill steps. Remember which entries were
# missing up front and restore them as real NaN afterwards.
readmission_raw = df['Readmission_30Days']
readmission_flag = (
    readmission_raw.astype(str)
    .str.lower()
    .str.strip()
    .replace({'yes': 'Y', 'y': 'Y', 'no': 'N', 'n': 'N'})
)
# `where` keeps values where the condition holds and puts NaN elsewhere.
df['Readmission_30Days'] = readmission_flag.where(readmission_raw.notna())

Standardize Gender

In [27]:
# Collapse spelled-out genders to the single-letter codes; entries that are
# already 'M' / 'F' pass through the mapping unchanged.
gender_codes = {'Male': 'M', 'Female': 'F'}
gender_clean = df['Gender'].str.strip().str.title()
df['Gender'] = gender_clean.replace(gender_codes)

Blank out invalid dates (where discharge < admission); the affected rows are dropped later by the length-of-stay filter

In [28]:
# Flag rows whose discharge precedes admission.
# Compare parsed timestamps rather than the raw column values: comparing raw
# text raises a TypeError as soon as either column contains a missing value.
# Unparseable or missing dates become NaT, and any comparison with NaT is
# False, so such rows are simply not flagged here.
admitted_on = pd.to_datetime(df['Admission_Date'], errors='coerce')
discharged_on = pd.to_datetime(df['Discharge_Date'], errors='coerce')
invalid_dates = discharged_on < admitted_on

In [32]:
# Blank both dates on the rows selected by `invalid_dates`; the pair cannot be
# trusted once discharge precedes admission.
date_cols = ['Admission_Date', 'Discharge_Date']
df.loc[invalid_dates, date_cols] = np.nan

Fill Wait Time missing with department median 

In [33]:
# Fill missing wait times with the median of the patient's department.
# The per-department median is computed as a row-aligned Series and used only
# to fill gaps. Assigning the groupby/transform result straight back would put
# NaN on every row whose Department is missing (those rows belong to no
# group), wiping wait times that were actually recorded.
dept_median_wait = df.groupby('Department')['Wait_Time_Minutes'].transform('median')
df['Wait_Time_Minutes'] = df['Wait_Time_Minutes'].fillna(dept_median_wait)

Fill Satisfaction Score missing with department median (rounded to a whole score)

In [34]:
# Fill missing satisfaction scores with the department median, rounded to a
# whole score.
# Rounding is done with Series.round() on the row-aligned medians: the builtin
# round() raises ValueError on NaN, which happens for any department that has
# no recorded score at all. Using fillna on the original column (instead of
# assigning the transform result) also keeps existing scores on rows whose
# Department is missing.
dept_median_score = df.groupby('Department')['Satisfaction_Score'].transform('median')
df['Satisfaction_Score'] = df['Satisfaction_Score'].fillna(dept_median_score.round())

Derived columns

In [35]:
# Length of stay in whole days.
# The date columns are read from CSV as text, and text cannot be subtracted,
# so parse them here. Parsing is a no-op if the columns are already datetimes.
# Missing or unparseable dates become NaT, which yields a missing
# Length_of_Stay for that row.
admission = pd.to_datetime(df['Admission_Date'], errors='coerce')
discharge = pd.to_datetime(df['Discharge_Date'], errors='coerce')
df['Length_of_Stay'] = (discharge - admission).dt.days

In [36]:
# Profit per visit: revenue collected minus the cost of treatment.
df['Profit'] = df['Total_Revenue'].sub(df['Treatment_Cost'])

Remove rows with invalid or missing length of stay

In [37]:
# Keep only rows with a known, non-negative length of stay.
# .copy() makes the result an independent frame; without it the column
# assignments in the following cells operate on a slice of the old frame and
# trigger SettingWithCopyWarning.
has_valid_stay = df['Length_of_Stay'].notna() & df['Length_of_Stay'].ge(0)
df = df.loc[has_valid_stay].copy()

Fill remaining numeric columns with department mean

In [38]:
# Names of every numeric column currently in the frame.
numeric_cols = df.select_dtypes(include='number').columns

In [39]:
# Fill whatever is still missing in each numeric column with the mean of the
# patient's department.
# fillna on the original column only touches the gaps. Assigning the
# groupby/transform result directly would replace every value on rows whose
# Department is missing with NaN, because those rows belong to no group.
for col in numeric_cols:
    dept_mean = df.groupby('Department')[col].transform('mean')
    df[col] = df[col].fillna(dept_mean)

Fill remaining categorical columns with department mode

In [40]:
# Names of every object-dtype (text) column currently in the frame.
categorical_cols = df.select_dtypes(include='object').columns

In [41]:
def _most_common(values):
    """Return the most frequent non-null value, or 'Unknown' if there is none.

    Ties are resolved by taking the first entry of ``Series.mode()``.
    """
    modes = values.mode()  # computed once per group
    return modes.iloc[0] if not modes.empty else 'Unknown'


# Fill whatever is still missing in each text column with the most common
# value in the patient's department.
# fillna on the original column only touches the gaps. Assigning the
# groupby/transform result directly would replace every value on rows whose
# Department is missing with NaN, because those rows belong to no group.
for col in categorical_cols:
    dept_mode = df.groupby('Department')[col].transform(_most_common)
    df[col] = df[col].fillna(dept_mode)

Export Cleaned Data

In [42]:
# Write the cleaned frame to a CSV resolved against the current working
# directory; index=False keeps the pandas row index out of the file.
CLEANED_DATA_PATH = "Hospital_Operations_Cleaned.csv"
df.to_csv(CLEANED_DATA_PATH, index=False)