# 🔍 ML Data Inspection Plan
Generated 2025-04-18 02:15

In [1]:
import pandas as pd
import numpy as np
from scipy.stats import ks_2samp
# Seed NumPy's global RNG so the np.random-generated synthetic data is reproducible.
np.random.seed(42)

## 1️⃣ Data Snapshot and Memory

In [2]:
# Data snapshot: confirm the data loaded correctly before any modelling.
#   - Shape & memory: how many rows × columns, and how many bytes?
#   - Flag tiny/huge datasets; watch RAM if you’ll bootstrap 100 trees.
#   - Are there any memory-hogging columns (e.g., long strings)?

# Synthetic example dataset. The dict literal is evaluated top to bottom, so
# the np.random calls run in the order the columns are listed.
n = 100
df = pd.DataFrame({
    'Department': np.random.choice(
        ['ICU', 'SURGERY', 'ORTHO', 'PEDIA', 'CARDIO'], size=n
    ),
    'Bed_Occupancy': np.random.randint(low=0, high=100, size=n),
    'Emergency_Admissions': np.random.randint(low=0, high=50, size=n),
    'Weekend': np.random.choice([0, 1], size=n),
}).assign(
    # Label: 1 only for ICU rows with more than 10 emergency admissions.
    beds_required=lambda frame: (
        frame['Department'].eq('ICU') & frame['Emergency_Admissions'].gt(10)
    ).astype(int),
    # One row per calendar day, starting 2023-01-01.
    timestamp=pd.date_range(start='2023-01-01', periods=n, freq='D'),
)

# memory_usage='deep' also measures the contents of object (string) columns.
df.info(memory_usage='deep')
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 6 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   Department            100 non-null    object        
 1   Bed_Occupancy         100 non-null    int64         
 2   Emergency_Admissions  100 non-null    int64         
 3   Weekend               100 non-null    int64         
 4   beds_required         100 non-null    int64         
 5   timestamp             100 non-null    datetime64[ns]
dtypes: datetime64[ns](1), int64(4), object(1)
memory usage: 9.3 KB


Unnamed: 0,Department,Bed_Occupancy,Emergency_Admissions,Weekend,beds_required,timestamp
0,PEDIA,27,1,1,0,2023-01-01
1,CARDIO,6,27,0,0,2023-01-02
2,ORTHO,72,22,1,0,2023-01-03
3,CARDIO,71,36,0,0,2023-01-04
4,CARDIO,11,31,1,0,2023-01-05


## 2️⃣ Target Distribution

In [3]:
# Target check: verify you actually have the label you think you have (and only one).
#   - Is the target binary, multiclass, or continuous?
#   - Are the classes evenly distributed?
# normalize=True reports each class as a proportion of rows instead of a raw count.

df['beds_required'].value_counts(normalize=True)

beds_required
0    0.83
1    0.17
Name: proportion, dtype: float64

## 3️⃣ Data Types Snapshot

In [4]:
# Dtype snapshot: spot categorical vs. numeric vs. datetime columns early;
# this drives split logic & preprocessing.
# Result: the number of columns of each dtype.

df.dtypes.value_counts()

int64             4
object            1
datetime64[ns]    1
Name: count, dtype: int64

## 4️⃣ Missing Value Summary (Fraction Missing per Column)

In [6]:
# Missing values: identify columns with NaNs that may affect model training
# or cause errors in algorithms. Too many NaNs ⇒ bias; plan imputation.
# NOTE(review): whether Random Forest tolerates NaNs at all depends on the
# library and version in use — confirm before relying on it.
# Watch for:
#   - columns with more than 30–50% missing values
#   - sparse features (too few non-NaNs)
# Result: fraction of missing entries per column, largest first.

df.isna().mean().sort_values(ascending=False)

Department              0.0
Bed_Occupancy           0.0
Emergency_Admissions    0.0
Weekend                 0.0
beds_required           0.0
timestamp               0.0
dtype: float64

## 5️⃣ Categorical Column Cardinality

In [7]:
# Categorical cardinality: find categorical features with very high cardinality
# (i.e., many unique values), which may need encoding strategies or filtering.
# High cardinality = big one-hot blow-ups (RF copes, but memory suffers).
# Watch for:
#   - categorical columns with hundreds or thousands of categories
#   - IDs mistakenly treated as categories
# Result: distinct-value count per object-dtype column, sorted ascending
# (so the highest-cardinality columns come last).
# NOTE(review): only 'object' columns are selected; columns stored with the
# pandas 'category' dtype would not be included.

df.select_dtypes('object').nunique().sort_values()

Department    5
dtype: int64

## 6️⃣ Basic Descriptive Statistics

In [8]:
# Descriptive statistics: summarize basic statistics for numerical and
# categorical features, to catch sensor glitches (e.g., negative ages)
# before splits go haywire.
# Watch for:
#   - outliers (min/max way off from the mean)
#   - unexpected distributions (e.g., negative values where not allowed)
# include='all' adds the non-numeric columns; .T transposes so that each
# feature is one row.

df.describe(include='all').T

Unnamed: 0,count,unique,top,freq,mean,min,25%,50%,75%,max,std
Department,100.0,5.0,PEDIA,26.0,,,,,,,
Bed_Occupancy,100.0,,,,48.68,0.0,26.75,50.0,72.5,98.0,29.547045
Emergency_Admissions,100.0,,,,23.48,0.0,10.75,24.0,34.0,49.0,14.245666
Weekend,100.0,,,,0.52,0.0,0.0,1.0,1.0,1.0,0.502117
beds_required,100.0,,,,0.17,0.0,0.0,0.0,0.0,1.0,0.377525
timestamp,100.0,,,,2023-02-19 12:00:00,2023-01-01 00:00:00,2023-01-25 18:00:00,2023-02-19 12:00:00,2023-03-16 06:00:00,2023-04-10 00:00:00,


## 7️⃣ Class Imbalance Check

In [None]:
# Class imbalance check: assess imbalance in classification problems, which
# can affect model performance and evaluation.
#   - Severe skew (e.g., 95% of one class) makes OOB error misleading and may
#     call for stratified sampling.
#   - If imbalanced, use roc_auc / f1-score instead of just accuracy.

y = df['beds_required']
# A single print with a newline separator emits the heading and then the table.
print("Class distribution:", y.value_counts(normalize=True), sep="\n")

## 8️⃣ Leakage Probes

In [None]:
# Leakage probes: find features that "cheat" by containing information about
# the target. Columns that perfectly predict the label (timestamps, IDs)
# inflate accuracy.
# Watch for:
#   - perfect correlation with the target
#   - timestamp- or ID-based leaks
#   - categorical values that map 1:1 with the label

# Numeric probe: absolute correlation of every numeric column with the target,
# strongest first. The target column is itself numeric, so it is part of the
# listing too.
corr = (
    df.corr(numeric_only=True)
    .loc[:, 'beds_required']
    .abs()
    .sort_values(ascending=False)
)
print("Top correlations with target:", corr, sep="\n")

# Categorical probe: flag a column only when every one of its categories is
# associated with exactly one distinct label value.
for col in df.select_dtypes('object').columns:
    mapping = df.groupby(col)['beds_required'].nunique()
    if mapping.eq(1).all():
        print(f"{col} may be a leakage source.")

## 9️⃣ Duplicates and Redundancy

In [None]:
# Duplicates and redundancy: find redundant rows, which can skew training,
# overstate confidence in bagged trees, or violate assumptions.
#   - Counts rows where all columns are exactly the same as an earlier row.
#   - Near-duplicate rows need an (optional) deeper check.
# NOTE(review): every column takes part in the comparison, including
# `timestamp`; if timestamps are unique per row, nothing can register as a
# duplicate here — consider passing `subset=` with the feature columns.

df.duplicated().sum()

## 🔟 Time Order, Train-Test Split, and Drift

In [9]:
# Time order and train/test split: ensure temporal consistency in the split.
# For time-series features, RF bootstrapping breaks chronology, so a
# time-aware (chronological) split is needed.
#   - Is the dataset sorted by time?
#   - Do feature distributions shift between training and test periods?

# Order rows chronologically, then cut at a fixed date: rows before
# 2023-02-15 go to train, rows on or after it go to test.
df = df.sort_values(by='timestamp')
train = df.loc[df['timestamp'] < '2023-02-15']
test = df.loc[df['timestamp'] >= '2023-02-15']

def population_stability_index(expected, actual, bins=10):
    """Return the Population Stability Index (PSI) of ``actual`` vs ``expected``.

    Bin edges are derived from ``expected`` with ``np.histogram``. Both
    samples are then counted into the same ``bins + 1`` slots: slot 0 holds
    values below the smallest ``expected`` value, and the last slot is
    open-ended, so it also absorbs values above the largest ``expected``
    value.

    Args:
        expected: Array-like of reference (e.g. training-period) values.
        actual: Array-like of comparison (e.g. test-period) values.
        bins: Number of equal-width bins built from ``expected``.

    Returns:
        The PSI: 0 when both samples have identical slot proportions, and
        larger as the proportions diverge. ``nan`` if either sample is empty.
    """
    expected = np.asarray(expected).ravel()
    actual = np.asarray(actual).ravel()
    if expected.size == 0 or actual.size == 0:
        # No proportions can be formed from an empty sample.
        return np.nan

    edges = np.histogram(expected, bins=bins)[1]
    # np.digitize against edges[:-1] yields indices 0..bins, i.e. one slot per
    # edge. Both count vectors must have exactly this length; otherwise they
    # cannot be subtracted when `actual` has no values in the upper slots.
    n_slots = len(edges)

    def _proportions(values):
        slots = np.digitize(values, edges[:-1], right=False)
        counts = np.bincount(slots, minlength=n_slots)
        return counts / counts.sum()

    e_pct, a_pct = _proportions(expected), _proportions(actual)
    # The small epsilon keeps the log finite when a slot is empty in a sample.
    return np.sum((a_pct - e_pct) * np.log((a_pct + 1e-6) / (e_pct + 1e-6)))

# Drift check on Bed_Occupancy between the train and test periods.
# PSI is 0 when the binned proportions match and grows as they diverge.
psi = population_stability_index(
    expected=train.loc[:, 'Bed_Occupancy'],
    actual=test.loc[:, 'Bed_Occupancy'],
)
# Two-sample Kolmogorov–Smirnov test; only its p-value is kept.
ks_p = ks_2samp(
    train.loc[:, 'Bed_Occupancy'],
    test.loc[:, 'Bed_Occupancy'],
).pvalue
print("PSI:", psi)
print("KS p-value:", ks_p)

PSI: 0.4036841349673753
KS p-value: 0.7946174687375178
