# In [9]:
import pandas as pd
import numpy as np

# Load the data. The path is relative to the directory this notebook/script
# is executed from.
df_raw = pd.read_csv('../data/diabetic_data.csv')

# Dataset shape
print(f"Dataset shape: {df_raw.shape}")

# Preview the first rows. A bare `df_raw.head()` is only rendered when it is
# the last expression of a notebook cell; anywhere else its result is
# discarded, so print it explicitly.
print(df_raw.head())

# Copy raw data for cleaning so that df_raw itself is never modified.
df_clean = df_raw.copy()

# Create binary target: 1 where `readmitted` is exactly '<30', 0 for every
# other value.
df_clean['readmit_30'] = (df_clean['readmitted'] == '<30').astype(int)

# Sanity check class balance, expressed as a percentage of rows per class.
# The result is printed because a bare expression in the middle of a cell
# is evaluated and then thrown away without being shown.
class_balance = (
    df_clean['readmit_30']
    .value_counts(normalize=True)
    .rename({1: '<30', 0: 'not <30'})
    * 100
)
print(class_balance)

# Replace placeholder missing values: every cell equal to '?' becomes NaN.
df_clean = df_clean.replace('?', np.nan)

# Check missing values: per-column NaN counts, largest first. Only columns
# that actually contain NaN are printed (a bare expression here would be
# evaluated but never displayed).
missing_counts = df_clean.isna().sum().sort_values(ascending=False)
print(missing_counts[missing_counts > 0])

# Drop columns with too many missing values. errors='ignore' skips any
# listed column that is not present instead of raising KeyError.
cols_to_drop = ['weight', 'max_glu_serum', 'payer_code']
df_clean = df_clean.drop(columns=cols_to_drop, errors='ignore')

print(f"Shape after dropping columns: {df_clean.shape}")

# Columns whose missing entries are replaced by the explicit label 'Unknown'.
fill_unknown = ['medical_specialty', 'race', 'diag_1', 'diag_2', 'diag_3']

# One mapping of column -> fill label, so both fill rules share the same
# code path. A1Cresult gets its own label, 'Not_measured'.
fill_values = {col: 'Unknown' for col in fill_unknown}
fill_values['A1Cresult'] = 'Not_measured'

for col, fill_value in fill_values.items():
    # Columns that are absent from the frame are skipped rather than raising.
    if col in df_clean.columns:
        df_clean[col] = df_clean[col].fillna(fill_value)

# Verify no remaining missing values. Only columns that still contain NaN
# are reported, and the outcome is printed so the check is visible even
# when this is not the last expression of a notebook cell.
remaining_missing = df_clean.isna().sum().sort_values(ascending=False)
remaining_missing = remaining_missing[remaining_missing > 0]
if remaining_missing.empty:
    print("No missing values remain.")
else:
    print("Columns still containing missing values:")
    print(remaining_missing.head(10))


# Write the cleaned frame to CSV; index=False leaves the row index out of
# the file.
df_clean.to_csv(
    path_or_buf="../data/diabetic_data_clean_phase2.csv",
    index=False,
)






# Output: Dataset shape: (101766, 50)
