# Missingness Evaluation

Use this code to evaluate missingness. Treat it as a general first-pass analysis; deeper analysis is recommended.
It also draws a heatmap to measure correlations in missingness between columns.

In [None]:
def show_missing(df, title, sort_by="pct_missing", ascending=False, top=None):
    """
    Print and return a per-column summary of missing values.

    Only columns that contain at least one missing value appear in the
    summary, which is indexed by column name.

    Parameters:
    -----------
    df : pd.DataFrame
        Dataset to inspect.
    title : str
        Heading printed above the summary.
    sort_by : str, optional
        Summary column to sort on: 'total_rows', 'n_missing',
        'pct_missing' or 'n_non_null'.
    ascending : bool, optional
        Sort direction passed to ``sort_values``.
    top : int, optional
        If given, keep only the first `top` rows after sorting.

    Returns:
    --------
    pd.DataFrame
        The (possibly truncated) summary; empty when nothing is missing.
    """
    total = len(df)
    n_miss = df.isna().sum()

    summary = pd.DataFrame({
        "total_rows": total,
        "n_missing": n_miss,
        "pct_missing": (n_miss / total) * 100,
        "n_non_null": total - n_miss,
    })
    # Drop fully populated columns before sorting so they never show up.
    out = summary.loc[n_miss.gt(0)].sort_values(sort_by, ascending=ascending)
    if top is not None:
        out = out.head(top)

    print(f"\n{title}")
    if out.empty:
        print("No missing values.")
        return out

    try:
        display(out.style.format({"pct_missing": "{:.2f}%"}))
    except (NameError, ImportError):
        # `display` is only injected as a builtin by IPython/Jupyter, and
        # DataFrame.style needs the optional jinja2 package. Fall back to a
        # plain-text table with the same percent formatting so the function
        # still works in scripts and minimal environments.
        print(out.to_string(formatters={"pct_missing": "{:.2f}%".format}))
    return out

def plot_missingness(df):
    """Draw three figures describing where `df` has missing values.

    1. Bar chart of the missing count per column, largest first.
    2. Heat map of the row-by-column missingness indicator
       (1 = missing, 0 = present).
    3. Heat map of the pairwise correlation between the columns'
       missingness indicators.

    Each figure is shown with ``plt.show()``; nothing is returned.
    """
    is_null = df.isna()

    # Figure 1: bar of missing counts
    per_column = is_null.sum().sort_values(ascending=False)
    plt.figure(figsize=(10, 4))
    per_column.plot(kind="bar")
    plt.ylabel("Missing count")
    plt.title("Missing per column")
    plt.tight_layout()
    plt.show()

    # Figure 2: heat map of the missingness pattern (rows × columns)
    indicator = is_null.astype(int)
    column_positions = range(len(df.columns))
    plt.figure(figsize=(10, 4))
    plt.imshow(indicator, aspect="auto", interpolation="nearest")
    plt.xticks(column_positions, df.columns, rotation=90)
    plt.yticks([])
    plt.title("Missingness heat map (1=missing, 0=present)")
    plt.tight_layout()
    plt.show()

    # Figure 3: correlation of missingness
    # (which columns tend to be missing together)
    corr = indicator.corr()
    labels = corr.columns
    label_positions = range(len(labels))
    plt.figure(figsize=(8, 6))
    image = plt.imshow(corr, vmin=-1, vmax=1)
    plt.colorbar(image, fraction=0.046, pad=0.04)
    plt.xticks(label_positions, labels, rotation=90)
    plt.yticks(label_positions, labels)
    plt.title("Correlation of Missingness")
    plt.tight_layout()
    plt.show()

# Example usage: print the missing-value summary and draw the missingness
# plots for `df` before any imputation is applied.
# NOTE(review): `df` is not defined in this cell - presumably loaded in an
# earlier notebook cell; confirm before running.
show_missing(df, "Missing summary BEFORE filling")
plot_missingness(df)

In [None]:
def clean_analyze_missing_with_heatmap(df, pattern_cols=None, cat_fill="Unknown"):
    """
    Cleans '?' placeholders, imputes missing values, analyzes patterns,
    and visualizes missing data with a heatmap.

    The DataFrame is modified in place: '?' cells become NaN, object and
    category columns with missing values are converted to category dtype
    and filled with `cat_fill`, and numeric columns with missing values
    are filled with their median.

    Parameters:
    -----------
    df : pd.DataFrame
        Your dataset. It is mutated by this function.
    pattern_cols : list, optional
        Columns to check for patterns in missing rows (e.g., 'fraud_reported').
        If None, will use all categorical columns.
    cat_fill : str, optional
        Category name to use for missing categorical values.

    Returns:
    --------
    pd.DataFrame
        The same `df` object after cleaning and imputation.
    """

    # STEP 1: Replace '?' with NaN
    df.replace("?", np.nan, inplace=True)

    # STEP 2: Missing value summary BEFORE filling
    print("\n=== Missing value summary BEFORE filling ===")
    missing_before = df.isna().sum()
    print(missing_before[missing_before > 0])

    # Default pattern columns
    if pattern_cols is None:
        pattern_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()

    # STEP 3: Pattern analysis
    print("\n=== Pattern analysis for missing values ===")
    for col in df.columns:
        is_missing = df[col].isna()
        n_missing = is_missing.sum()
        if n_missing == 0:
            continue
        print(f"\n--- Missing in '{col}' ({n_missing} rows) ---")
        for pcol in pattern_cols:
            if pcol == col:
                continue
            print(f"\nDistribution of '{pcol}' when '{col}' is missing:")
            print(df.loc[is_missing, pcol].value_counts(dropna=False))

    # STEP 4: Visualize missingness
    null_mask = df.isna()
    plt.figure(figsize=(12, 6))
    sns.heatmap(null_mask, cbar=False, cmap="coolwarm")
    plt.title("Missing Data Heatmap")
    plt.xlabel("Columns")
    plt.ylabel("Rows")
    plt.show()

    # Optional: Correlation of missingness between columns
    plt.figure(figsize=(10, 8))
    sns.heatmap(null_mask.corr(), annot=True, cmap="coolwarm")
    plt.title("Correlation of Missingness Between Columns")
    plt.show()

    # STEP 5: Fill missing values
    # Results are assigned back with `df[col] = ...` rather than calling
    # `df[col].fillna(..., inplace=True)`: the chained in-place form acts on
    # a temporary Series and does not update `df` under pandas Copy-on-Write.
    for col in df.select_dtypes(include=['object', 'category']).columns:
        if not df[col].isna().any():
            continue
        as_category = df[col].astype('category')
        # add_categories raises ValueError for a category that already
        # exists, e.g. when `cat_fill` is already a value in this column.
        if cat_fill not in as_category.cat.categories:
            as_category = as_category.cat.add_categories([cat_fill])
        df[col] = as_category.fillna(cat_fill)

    for col in df.select_dtypes(include=np.number).columns:
        if df[col].isna().any():
            df[col] = df[col].fillna(df[col].median())

    # STEP 6: Missing value summary AFTER filling
    print("\n=== Missing value summary AFTER filling ===")
    missing_after = df.isna().sum()
    print(missing_after[missing_after > 0])

    return df