**Different Types of Missing Values**

1. MCAR - Missing Completely At Random
2. MAR - Missing At Random
3. MNAR - Missing Not At Random

In [None]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.experimental import enable_iterative_imputer  
from sklearn.impute import IterativeImputer

In [None]:
# Directory under which Kaggle mounts this notebook's input dataset.
base_path = '/kaggle/input/different-types-of-missing-data/'

# Read the three example files (one per missingness mechanism) in a single
# pass; the files are read in the same order as the names on the left.
df_mcar, df_mar, df_mnar = (
    pd.read_csv(base_path + csv_name)
    for csv_name in ('dataset_mcar.csv', 'dataset_mar.csv', 'dataset_mnar.csv')
)

# Report the total number of missing cells per dataset: isna() marks each
# missing cell, the first sum() counts per column, the second sums the columns.
for caption, frame in (
    ("MCAR Missing Count:", df_mcar),
    ("MAR Missing Count:", df_mar),
    ("MNAR Missing Count:", df_mnar),
):
    print(caption, frame.isna().sum().sum())

In [None]:
# --- 1. MCAR (Missing Completely At Random) ---------------------------------
# File:     dataset_mcar.csv
# Approach: univariate mean imputation of the 'Temperature' column.
# Reason:   under the MCAR premise (assumed to hold for this dataset) the
#           observed values are a random sample of all values, so their mean
#           is an unbiased stand-in for the missing entries.

# Re-read the CSV so this cell works from the file contents rather than from
# a df_mcar that an earlier cell run may already have modified.
df_mcar = pd.read_csv('/kaggle/input/different-types-of-missing-data/dataset_mcar.csv')

imputer_mcar = SimpleImputer(strategy='mean')

# scikit-learn transformers expect 2-D input, so the column is selected with
# a list ([['Temperature']]) to get a one-column DataFrame, not a Series.
temperature_2d = df_mcar[['Temperature']]
df_mcar['Temperature'] = imputer_mcar.fit_transform(temperature_2d)

# One print call; sep="\n" emits exactly what three separate prints would.
print(
    "MCAR (Temperature) handled with Mean Imputation.",
    df_mcar.head(),
    "\n",
    sep="\n",
)

In [None]:
# --- 2. MAR (Missing At Random) ---------------------------------------------
# File:     dataset_mar.csv
# Approach: multivariate imputation with IterativeImputer (MICE-style).
# Reason:   the dataset is presumed to have Blood_Pressure missingness that
#           depends on Age (confirm against the data). In that situation a
#           plain column mean is biased, so Age is used to predict the
#           missing Blood_Pressure values instead.

df_mar = pd.read_csv('/kaggle/input/different-types-of-missing-data/dataset_mar.csv')

# Both columns go to the imputer so it can model one from the other.
mar_columns = ['Age', 'Blood_Pressure']

# IterativeImputer models each feature with missing values as a function of
# the other features; with only these two columns that is roughly a
# regression of Blood_Pressure on Age. max_iter=10 caps the imputation
# rounds and random_state=0 fixes the seed.
imputer_mar = IterativeImputer(random_state=0, max_iter=10)

df_mar_imputed = imputer_mar.fit_transform(df_mar[mar_columns])

# Write the imputed values back over the original two columns.
df_mar[mar_columns] = df_mar_imputed

print("MAR (Blood_Pressure) handled using Age correlations (Iterative Imputer).")
print(df_mar.head())
print("\n")

In [None]:
# --- 3. MNAR (Missing Not At Random) ----------------------------------------
# File:     dataset_mnar.csv
# Approach: "flag and fill" - add a missingness indicator column, then apply
#           a simple median imputation.
# Reason:   the dataset is presumed to hide Income values *because* they are
#           high (confirm against the data), so a median fill on its own
#           would understate them. The extra 0/1 column records which rows
#           were filled, so a downstream model can treat them differently.

df_mnar = pd.read_csv('/kaggle/input/different-types-of-missing-data/dataset_mnar.csv')

# Step A - record missingness BEFORE filling (1 = was missing, 0 = present).
# This must come first: after imputation the column has no NaNs left to flag.
income_was_missing = df_mnar['Income'].isna()
df_mnar['Income_Missing_Flag'] = income_was_missing.astype(int)

# Step B - fill the gaps with the median of the observed incomes. A sentinel
# constant such as -1 would be an alternative; the flag column carries the
# "was missing" signal in either case.
imputer_mnar = SimpleImputer(strategy='median')
income_2d = df_mnar[['Income']]  # 2-D selection, as scikit-learn expects
df_mnar['Income'] = imputer_mnar.fit_transform(income_2d)

print("MNAR (Income) handled using Flagging + Median.")
print(df_mnar.head())
print("\n")