# 🧼 02_data_cleaning_eda.ipynb

### **Objective**
This notebook handles the data cleaning and exploratory data analysis (EDA) for the healthcare project. It takes the raw data, cleans it, and provides initial insights before formal KPI calculations.


In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Load the raw export. The path is relative, so it resolves against the
# kernel's current working directory.
df_raw = pd.read_csv('../data/raw/healthcare_raw_data.csv')
print('Data loaded successfully.')

# DataFrame.info() writes its report straight to stdout and returns None,
# so it is called directly; wrapping it in print() would append a stray "None".
print('\nInitial Info:')
df_raw.info()

print('\nFirst 5 rows:')
print(df_raw.head())

### 2. Data Cleaning & Transformation

In [None]:
# Parse the 'date' column into pandas datetimes (the column keeps its position).
df_raw = df_raw.assign(date=pd.to_datetime(df_raw['date']))
print("\n'date' column converted to datetime.")

# Drop fully duplicated rows and report how many disappeared.
initial_rows = len(df_raw)
df_raw = df_raw.drop_duplicates()
rows_removed = initial_rows - len(df_raw)
print(f"\nRemoved {rows_removed} duplicate rows.")

# Missing entries are only counted here; nothing is imputed or dropped.
missing_per_column = df_raw.isna().sum()
print('\nMissing values per column:')
print(missing_per_column)

# Columns checked for values below zero.
numerical_cols = [
    'patients_admitted',
    'discharges',
    'readmissions',
    'avg_stay_days',
    'er_wait_time',
    'revenue',
]
# Sanity check: warn (without modifying the data) for each column holding a negative value.
for col in numerical_cols:
    has_negative = df_raw[col].lt(0).any()
    if has_negative:
        print(f"Warning: Negative values found in '{col}'.")

### 3. Exploratory Data Analysis (EDA)

In [None]:
# (Optional) EDA plot; enable in an environment with display.
# matplotlib (plt) and seaborn (sns) are already imported in the first cell.
# sns.histplot(df_raw['satisfaction_rating'], kde=True, bins=20)
# plt.title('Distribution of Patient Satisfaction Ratings')
# plt.show()

# Total admissions per calendar month, with each bin labelled by its month-end date.
# The bin width is passed as a MonthEnd offset object rather than the string
# alias 'M': pandas 2.2 deprecated that alias (renamed to 'ME'), whereas the
# offset object is accepted unchanged by both older and newer pandas versions.
monthly_admissions = df_raw.groupby(
    pd.Grouper(key='date', freq=pd.offsets.MonthEnd())
)['patients_admitted'].sum()
print(monthly_admissions.head())

### 4. Save Cleaned Data

In [None]:
output_path = '../data/processed/healthcare_processed_data.csv'

# Make sure the destination folder exists first: to_csv does not create
# missing parent directories and would fail on a fresh checkout.
Path(output_path).parent.mkdir(parents=True, exist_ok=True)

# index=False keeps the DataFrame's row index out of the written file.
df_raw.to_csv(output_path, index=False)
print(f"\nCleaned and processed data saved to {output_path}")

# DataFrame.info() prints its own report and returns None, so it is called
# directly; print(df_raw.info()) would emit an extra "None" line.
print('\nFinal DataFrame Info:')
df_raw.info()