# Diabetic Data Cleaning Notebook
This notebook performs Phase 1 of the data cleaning process for the diabetic dataset. It loads the raw data, handles missing values, removes expired patients, merges ID descriptions, and validates the final cleaned dataset.

In [None]:
import pandas as pd
import numpy as np
import os

## 1️⃣ Load the raw diabetic data

In [None]:
# Input and output locations (relative paths; adjust if needed)
RAW_DATA_PATH = '../data/raw/diabetic_data.csv'
ID_MAP_PATH = '../data/raw/IDs_mapping.csv'
OUTPUT_PATH = '../data/processed/diabetic_data_clean.csv'

# Read the raw file, treating the '?' placeholder as a missing value (NaN)
df = pd.read_csv(filepath_or_buffer=RAW_DATA_PATH, na_values='?')
# Last expression of the cell: shows the first rows as a quick sanity check
df.head()

## 2️⃣ Drop `weight` column due to excessive missing values

In [None]:
# Drop `weight` only when it is present and more than 90% of its values are
# missing. The column-presence check keeps this cell safe to re-run after the
# column has already been removed (otherwise df['weight'] raises KeyError).
if 'weight' in df.columns and df['weight'].isna().mean() > 0.9:
    df = df.drop(columns=['weight'])
    print('Dropped weight column due to >90% missing values')

## 3️⃣ Remove expired patients

In [None]:
# discharge_disposition_id codes that this notebook treats as "expired"
expired_ids = [11, 19, 20, 21]
# Keep every row whose discharge code is not in the expired list. The explicit
# .copy() makes the result an independent DataFrame, so adding columns to it
# later does not raise pandas' SettingWithCopyWarning.
df = df[~df['discharge_disposition_id'].isin(expired_ids)].copy()
print(f'Remaining rows after removing expired patients: {len(df)}')

## 4️⃣ Load and merge ID mappings

In [None]:
# ---------------- 4️⃣ Load and merge ID mappings ---------------- #

# Function to parse the IDs_mapping.csv into separate DataFrames
def load_mapping_sections(mapping_csv_path):
    """
    Parse IDs_mapping.csv, which stacks several mapping tables in one file.

    Each table starts with a header line that begins with its ID column name
    (``admission_type_id``, ``discharge_disposition_id`` or
    ``admission_source_id``). Every following ``<digits>,<description>`` line
    is added to that table until the next header is seen. Blank lines, lines
    without a comma, lines that appear before the first header, and lines
    whose first field is not all digits are skipped.

    Parameters
    ----------
    mapping_csv_path : str or path-like
        Location of the mapping CSV file.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(admission_type, discharge_disposition, admission_source)``. Each
        frame has the columns ``id`` and ``description``; a table that is
        missing from the file comes back empty but still has both columns.

    Raises
    ------
    OSError
        If the file cannot be opened.
    """
    section_names = (
        "admission_type_id",
        "discharge_disposition_id",
        "admission_source_id",
    )
    sections = {name: [] for name in section_names}
    current_section = None

    # "utf-8-sig" strips a leading byte-order mark if one is present; without
    # that, the first header line would not match and its whole table would be
    # silently dropped. Files without a BOM are read exactly as with "utf-8".
    with open(mapping_csv_path, "r", encoding="utf-8-sig") as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                continue

            # A header line switches the table that subsequent rows go into
            header = next(
                (name for name in section_names if line.startswith(name)),
                None,
            )
            if header is not None:
                current_section = header
                continue

            # Skip malformed lines and anything before the first header
            if "," not in line or current_section is None:
                continue

            # Split on the first comma only: descriptions may contain commas
            id_val, desc = line.split(",", 1)
            if id_val.isdigit():
                sections[current_section].append(
                    {"id": int(id_val), "description": desc.strip('"')}
                )

    # Explicit columns guarantee 'id' and 'description' exist even when a
    # section collected no rows, so downstream lookups do not hit a KeyError.
    columns = ["id", "description"]
    admission_type_df = pd.DataFrame(sections["admission_type_id"], columns=columns)
    discharge_df = pd.DataFrame(sections["discharge_disposition_id"], columns=columns)
    admission_source_df = pd.DataFrame(sections["admission_source_id"], columns=columns)

    return admission_type_df, discharge_df, admission_source_df



In [None]:
# Parse the multi-section mapping file into one lookup table per ID column
mapping_tables = load_mapping_sections(ID_MAP_PATH)
admission_type_map, discharge_map, admission_source_map = mapping_tables


### Note: The mapping CSV contains multiple sections. `load_mapping_sections` (above) splits it into one DataFrame per ID column; the next cell maps those descriptions onto the main DataFrame.

In [None]:
# Function to map numeric IDs to descriptions
def merge_id_descriptions_safe(df, mapping_df, id_col, new_col_name):
    """
    Add a description column to ``df`` by looking up each value of ``id_col``.

    The lookup is built from the ``id`` and ``description`` columns of
    ``mapping_df``. Values of ``id_col`` that have no entry in the lookup
    become NaN in the new column. ``df`` is modified in place and is also
    returned, so only the single column ``new_col_name`` is added.
    """
    id_to_description = dict(zip(mapping_df['id'], mapping_df['description']))
    df[new_col_name] = df[id_col].map(id_to_description)
    print(f"Mapped '{id_col}' to '{new_col_name}'")
    return df

# Attach a human-readable description column for each coded ID column
for mapping_table, id_column, desc_column in (
    (admission_type_map, 'admission_type_id', 'admission_type_desc'),
    (discharge_map, 'discharge_disposition_id', 'discharge_desc'),
    (admission_source_map, 'admission_source_id', 'admission_source_desc'),
):
    df = merge_id_descriptions_safe(df, mapping_table, id_column, desc_column)

# Preview the result (last expression of the cell, so it is displayed)
df.head()


## 5️⃣ Remove duplicate rows

In [None]:
# Remember the row count so the number of dropped rows can be reported
initial_count = len(df)
# Rows are duplicates only when every column matches an earlier row
df = df.drop_duplicates()
removed_count = initial_count - len(df)
print(f'Removed {removed_count} duplicate rows.')

## 6️⃣ Validate cleaned dataset

In [None]:
# Print column names, non-null counts and dtypes
df.info()
# Summary statistics for every column, numeric or not
summary_stats = df.describe(include='all')
# Last expression of the cell, so the table is displayed
summary_stats

## 7️⃣ Save cleaned dataset

In [None]:
# Create the output folder first: to_csv does not create missing directories
# and would fail on a fresh checkout where ../data/processed is absent.
output_dir = os.path.dirname(OUTPUT_PATH)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# index=False keeps the DataFrame's row index out of the saved file
df.to_csv(OUTPUT_PATH, index=False)
print(f'Saved cleaned dataset to {OUTPUT_PATH}')

## Cleaning Decisions Summary:
- **Missing values**: Replaced '?' with NaN for proper handling.
- **Weight column**: Dropped due to >90% missing values.
- **Expired patients**: Removed patients whose `discharge_disposition_id` is 11, 19, 20, or 21 (the codes this notebook treats as expired).
- **ID mappings**: Merged human-readable descriptions for admission type, discharge disposition, and admission source.
- **Duplicates**: Removed duplicate rows to ensure clean data.
- **Validation**: Dataset info and descriptive stats checked post-cleaning.