In [1]:
import pandas as pd

# === 1. Configuration ===
# Point these at your actual input/output CSV paths.
input_file = "healthcare_dataset.csv"
output_file = "healthcare_dataset_cleaned.csv"

# Columns parsed as datetimes (unparseable values become NaT).
date_columns = ['Date of Admission', 'Discharge Date']

# Free-text columns whose missing values are replaced with 'Unknown'.
text_columns = ['Name', 'Gender', 'Blood Type', 'Medical Condition', 'Doctor',
                'Hospital', 'Insurance Provider', 'Admission Type', 'Medication', 'Test Results']

# Placeholder written into missing cells of the non-text columns.
# NOTE: 1900-01-01 is a sentinel, not a real date; rows carrying it should be
# excluded before doing any date arithmetic on the cleaned file.
missing_defaults = {
    'Billing Amount': 0,
    'Date of Admission': pd.Timestamp("1900-01-01"),
    'Discharge Date': pd.Timestamp("1900-01-01"),
}


def convert_types(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with date and billing columns coerced to real dtypes.

    Columns listed in ``date_columns`` are parsed with ``pd.to_datetime`` and
    'Billing Amount' with ``pd.to_numeric``; values that cannot be parsed
    become NaT / NaN instead of raising. Columns that are absent from *df*
    are skipped, so a partial dataset does not raise ``KeyError``.
    The input frame is not modified.
    """
    converted = df.copy()

    for col in date_columns:
        if col in converted.columns:
            converted[col] = pd.to_datetime(converted[col], errors='coerce')  # invalid dates -> NaT

    if 'Billing Amount' in converted.columns:
        converted['Billing Amount'] = pd.to_numeric(converted['Billing Amount'], errors='coerce')

    return converted


def fill_missing(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with missing values replaced by fixed placeholders.

    Text columns (``text_columns``) receive 'Unknown'; the columns named in
    ``missing_defaults`` receive their configured default. Columns that are
    absent from *df* are skipped. Intended to run after ``convert_types`` so
    that the date and billing columns already have their final dtypes.
    The input frame is not modified.
    """
    filled = df.copy()

    for col in text_columns:
        if col in filled.columns:
            filled[col] = filled[col].fillna('Unknown')

    for col, default in missing_defaults.items():
        if col in filled.columns:
            filled[col] = filled[col].fillna(default)

    return filled


# The pipeline itself stays at module level (inside the guard) so that, when
# run as a script or notebook cell, `df` and `duplicates` remain available as
# globals for later inspection. Importing this file only defines the helpers.
if __name__ == "__main__":
    # Load the dataset
    df = pd.read_csv(input_file)

    # === 2. Basic Info ===
    print("---- DATA PREVIEW ----")
    print(df.head(), "\n")

    # === 3. Check for Nulls ===
    print("---- NULL VALUES ----")
    print(df.isnull().sum(), "\n")

    # === 4. Check for Duplicates ===
    duplicates = df.duplicated().sum()
    print(f"---- DUPLICATE ROWS ----\nTotal duplicates: {duplicates}\n")

    # Remove duplicate rows if any
    if duplicates > 0:
        df = df.drop_duplicates()
        print("Duplicates removed.\n")

    # === 5. Convert Data Types ===
    df = convert_types(df)

    # === 6. Handle Missing Values (optional cleanup) ===
    df = fill_missing(df)

    # === 7. Final Data Info ===
    print("---- DATA TYPES ----")
    print(df.dtypes, "\n")

    print("---- CLEANED DATA PREVIEW ----")
    print(df.head(), "\n")

    # === 8. Save Cleaned Data ===
    df.to_csv(output_file, index=False)
    print(f"✅ Cleaned dataset saved as '{output_file}'")


---- DATA PREVIEW ----
            Name  Age  Gender Blood Type Medical Condition Date of Admission  \
0  Bobby JacksOn   30    Male         B-            Cancer        2024-01-31   
1   LesLie TErRy   62    Male         A+           Obesity        2019-08-20   
2    DaNnY sMitH   76  Female         A-           Obesity        2022-09-22   
3   andrEw waTtS   28  Female         O+          Diabetes        2020-11-18   
4  adrIENNE bEll   43  Female        AB+            Cancer        2022-09-19   

             Doctor                    Hospital Insurance Provider  \
0     Matthew Smith             Sons and Miller         Blue Cross   
1   Samantha Davies                     Kim Inc           Medicare   
2  Tiffany Mitchell                    Cook PLC              Aetna   
3       Kevin Wells  Hernandez Rogers and Vang,           Medicare   
4    Kathleen Hanna                 White-White              Aetna   

   Billing Amount  Room Number Admission Type Discharge Date   Medication  