# Null Handling 


## Import library

In [2]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer, SimpleImputer

### The function must support 5 null-handling methods
1. KNN 
2. Mean
3. Median
4. Mode
5. Dropping

In [3]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer, SimpleImputer

def handle_nulls(df, method=None, n_neighbors=5):
    """
    Handle missing values in a DataFrame (numerical & categorical).

    - Column types are auto-detected: only true numeric dtypes count as
      numeric; every other dtype (object/string, category, bool, datetime, ...)
      is treated as categorical and is only ever filled with its mode.
    - KNN is fitted once over all numeric columns and only the columns that
      requested it (and actually contain NaNs) are overwritten, so complete
      columns keep their original dtype.
    - Integer-valued numeric columns: the mean is rounded *before* imputation.
    - Columns with no observed value at all cannot be imputed and are left
      untouched (unless the method is 'drop').

    Parameters:
    -----------
    df : pandas.DataFrame
        Dataset to clean. It is copied; the caller's frame is never modified.
    method : str, dict, or None
        - str: same method for all columns ('mean', 'median', 'mode', 'knn', 'drop').
        - dict: per-column method, e.g. {'Age': 'median', 'Department': 'mode'}.
          A column mapped to None is skipped (its NaNs are kept); a column
          missing from the dict uses the auto rule.
        - None: auto (mean for numeric, mode for everything else).
        'mean', 'median' and 'knn' fall back to 'mode' on non-numeric columns.
    n_neighbors : int
        Neighbors for KNN (numeric only).

    Returns:
    --------
    df : pandas.DataFrame
        DataFrame with NaNs handled. With 'drop', rows are removed and the
        original index labels are kept (no reset).

    Raises:
    -------
    ValueError
        If a column that contains NaNs is assigned an unknown method.
    """
    df = df.copy()

    # Whitelist numeric dtypes instead of "everything that is not object":
    # datetime/bool/category/string columns must not reach KNN or the mean.
    num_cols = df.select_dtypes(include='number').columns

    def resolve_method(col):
        """Return the method for `col`; None only when the dict explicitly skips it."""
        auto_choice = 'mean' if col in num_cols else 'mode'
        if isinstance(method, dict):
            return method[col] if col in method else auto_choice
        return auto_choice if method is None else method

    # --- KNN pass: one fit over all usable numeric columns ---
    knn_targets = [
        col for col in num_cols
        if resolve_method(col) == 'knn'
        and df[col].isnull().any()
        and df[col].notna().any()
    ]
    if knn_targets:
        # KNNImputer drops columns that are entirely NaN, which would break the
        # column alignment below, so they are excluded from the feature set.
        features = [col for col in num_cols if df[col].notna().any()]
        imputed = pd.DataFrame(
            KNNImputer(n_neighbors=n_neighbors).fit_transform(df[features]),
            columns=features,
            index=df.index,
        )
        df[knn_targets] = imputed[knn_targets]

    # --- Column-by-column pass for everything else ---
    for col in df.columns:
        series = df[col]
        if not series.isnull().any():
            continue

        chosen_method = resolve_method(col)
        if chosen_method is None:
            continue  # explicitly skipped via method[col] = None

        if chosen_method == 'drop':
            df = df.dropna(subset=[col])
            continue

        if chosen_method not in ('mean', 'median', 'mode', 'knn'):
            raise ValueError(f"Unknown method: {chosen_method}")

        observed = series.dropna()
        if observed.empty:
            continue  # nothing to learn a fill value from

        is_numeric = col in num_cols
        if chosen_method == 'mean' and is_numeric:
            fill_value = observed.mean()
            # Values may all be whole numbers even though the dtype is float
            # (NaN forces float); keep such columns integer-valued.
            if (observed % 1 == 0).all():
                fill_value = round(fill_value)
        elif chosen_method == 'median' and is_numeric:
            fill_value = observed.median()
        else:
            # 'mode', plus the fallback for mean/median/knn on non-numeric data.
            # Series.mode() is sorted, so ties resolve to the smallest value.
            fill_value = observed.mode().iloc[0]

        df[col] = series.fillna(fill_value)

    return df


Example: auto mode (default)

In [4]:
# Toy employee table: two numeric columns and two categorical columns,
# each with at least one missing entry.
data = dict(
    Age=[25, np.nan, 30, 22, np.nan, 40],
    Salary=[5000, 6000, np.nan, 5500, 5200, np.nan],
    Level=['Junior', 'Mid', np.nan, 'Senior', 'Junior', 'Mid'],  # ordinal-like
    Department=['HR', 'Finance', np.nan, 'HR', 'IT', 'Finance'],
)
df = pd.DataFrame(data=data)

In [5]:
# 1) Auto mode: no method given, so numeric columns get the mean
#    (rounded when every observed value is a whole number) and
#    categorical/ordinal columns get the mode.
df_auto = handle_nulls(df)
print("Auto Mode:\n", df_auto)

Auto Mode:
     Age  Salary   Level Department
0  25.0  5000.0  Junior         HR
1  29.0  6000.0     Mid    Finance
2  30.0  5425.0  Junior    Finance
3  22.0  5500.0  Senior         HR
4  29.0  5200.0  Junior         IT
5  40.0  5425.0     Mid    Finance


Example: KNN imputation

In [6]:
# Numeric-only dataset with missing values in every column (KNN demo input)
data_knn = dict(
    Age=[25, np.nan, 30, 22, 28, np.nan, 40],
    Salary=[5000, 6000, np.nan, 5500, 5200, 5800, np.nan],
    Experience=[1, 3, np.nan, 2, 4, np.nan, 5],
)

df_knn_test = pd.DataFrame(data=data_knn)
print("Original Dataset:\n", df_knn_test)

Original Dataset:
     Age  Salary  Experience
0  25.0  5000.0         1.0
1   NaN  6000.0         3.0
2  30.0     NaN         NaN
3  22.0  5500.0         2.0
4  28.0  5200.0         4.0
5   NaN  5800.0         NaN
6  40.0     NaN         5.0


In [7]:
df_knn_filled = handle_nulls(df_knn_test, method='knn', n_neighbors=3)
print("\nAfter KNN Imputation:\n", df_knn_filled) # Jalankan handle_nulls dengan KNN (numeric only)


After KNN Imputation:
     Age       Salary  Experience
0  25.0  5000.000000    1.000000
1  30.0  6000.000000    3.000000
2  30.0  5233.333333    2.333333
3  22.0  5500.000000    2.000000
4  28.0  5200.000000    4.000000
5  25.0  5800.000000    3.000000
6  40.0  5400.000000    5.000000


Example: custom per-column methods

In [8]:
# Sample for the per-column demo: two numeric columns and one categorical,
# each containing missing values.
data_custom = dict(
    Age=[25, np.nan, 30, 22, np.nan, 40],
    Salary=[5000, 6000, np.nan, 5500, 5200, np.nan],
    Department=['HR', 'Finance', np.nan, 'HR', 'IT', 'Finance'],
)
df = pd.DataFrame(data=data_custom)

In [9]:
# Per-column methods: mean for Age, median for Salary.
# Department is not listed in the dict, so it uses the auto rule (mode).
cleaned_custom = handle_nulls(df, method={'Age': 'mean', 'Salary': 'median'})
print(cleaned_custom)

    Age  Salary Department
0  25.0  5000.0         HR
1  29.0  6000.0    Finance
2  30.0  5350.0    Finance
3  22.0  5500.0         HR
4  29.0  5200.0         IT
5  40.0  5350.0    Finance


- Kolom Age 

Method yang dipilih untuk Age adalah 'mean'. Karena semua nilai non-missing di kolom ini berupa bilangan bulat, handle_nulls membulatkan rata-rata sebelum imputasi.

Rata-rata dari [25, 30, 22, 40] adalah (25 + 30 + 22 + 40)/4 = 29.25, lalu dibulatkan menjadi 29.

Nilai NaN di Age diganti dengan 29 (tampil sebagai 29.0 karena dtype kolom tetap float).

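- Kolom Salary 

Method yang dipilih untuk Salary adalah 'median'. Median dari [5000, 5200, 5500, 6000] adalah (5200 + 5500)/2 = 5350 karena jumlah data genap (median = rata-rata dua nilai tengah), jadi NaN diisi dengan 5350.0 tanpa pembulatan.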

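- Kolom Department 

Department tidak ada di dict, sehingga memakai default untuk kolom kategorikal, yaitu modus. HR dan Finance sama-sama muncul 2 kali; jika seri, nilai yang lebih kecil menurut urutan abjad yang dipakai, sehingga Modus = Finance.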

Contoh Pemakaian untuk None di Method Dict

In [10]:
# Small frame with one missing value in each column.
df = pd.DataFrame({
    'Age': [25, np.nan, 30],
    'Score': [100, np.nan, 90],
    'Dept': ['HR', np.nan, 'IT']
})

# Mapping a column to None in the method dict skips it: Score keeps its NaN,
# while Age is filled with the median and Dept with the mode.
cleaned = handle_nulls(df, method={'Age': 'median', 'Score': None, 'Dept': 'mode'})
print(cleaned)

    Age  Score Dept
0  25.0  100.0   HR
1  27.5    NaN   HR
2  30.0   90.0   IT
