© 2025 Vanargo · License: MIT. See the `LICENSE` file in the repository root.

# --- 01. Data loading and EDA --- #

Loading, cleaning, exploratory data analysis, and feature engineering for the Adult (Census Income) dataset.

In [None]:
# --- Imports and basic style --- #

import logging
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import chi2_contingency

# unified clean plotting style: default figure size, no top/right spines #
plt.rcParams.update(
    {
        "figure.figsize": (8, 4),
        "axes.spines.top": False,
        "axes.spines.right": False,
    }
)
sns.set_context("notebook")

# silence loggers of popular libraries: only ERROR and above get through #
for name in ("lightgbm", "xgboost", "matplotlib", "numba"):
    logging.getLogger(name).setLevel(logging.ERROR)

# relevant warning filters for this notebook #
# NOTE: the DeprecationWarning filter has no module restriction, so it applies to every library #
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", message=".*Glyph .* missing from current font.*")

# controlled randomness for reproducibility: generator seeded with 42 #
RNG = np.random.default_rng(42)

In [None]:
# --- Project paths bootstrap --- #

import sys

# define project root relative to this notebook #
# parents[0] is the immediate parent of the resolved working directory, so this
# assumes the kernel runs one level below the project root - TODO confirm
ROOT = Path.cwd().resolve().parents[0]
DATA_DIR = ROOT / "data"
RAW_DIR = DATA_DIR / "raw"
PROCESSED_DIR = DATA_DIR / "processed"
REPORTS_DIR = DATA_DIR / "reports"

# add project root to sys.path if missing (inserted first so it takes precedence) #
if str(ROOT) not in sys.path:
    sys.path.insert(0, str(ROOT))

print(f"[ok] project root: {ROOT}")
print(f"[ok] data dir: {DATA_DIR}")

In [None]:
# --- Notebook preamble: UX, magics, helpers --- #

# IPython magics #
%load_ext autoreload
%autoreload 2
%matplotlib inline

# display options #
pd.set_option("display.max_rows", 200)
pd.set_option("display.max_columns", 200)
pd.set_option("display.width", 120)
# floats are rendered with thousands separators and 4 decimal places #
pd.set_option("display.float_format", lambda x: f"{x:,.4f}")

# reproducibility #
# fallback: define RNG here only when no earlier cell has created it #
if "RNG" not in globals():
    RNG = np.random.default_rng(42)

# figures directory for this notebook (created with parents, no error if it exists) #
FIG_DIR_01 = REPORTS_DIR / "figures_01"
FIG_DIR_01.mkdir(parents=True, exist_ok=True)


def save_fig(fig, name: str, dpi: int = 150) -> Path:
    """
    Write `fig` as <name>.png inside FIG_DIR_01 and return the file path.

    The image is saved with a tight bounding box at the requested `dpi`;
    the destination path is printed once the file has been written.
    """
    target = FIG_DIR_01.joinpath(f"{name}.png")
    fig.savefig(target, bbox_inches="tight", dpi=dpi)
    print(f"[saved] {target}")
    return target


def log_df(df: pd.DataFrame, name: str) -> None:
    """
    Print a one-line summary of `df`: its label, shape and column names.
    """
    columns = list(df.columns)
    summary = f"[{name}] shape={df.shape} cols={columns}"
    print(summary)


# sanity print: library versions and the figures directory used by save_fig #
print(f"[env] pandas {pd.__version__} | numpy {np.__version__} | seaborn {sns.__version__}")
print(f"[figures] {FIG_DIR_01}")

In [None]:
# --- Loading Adult dataset (train/test) --- #

# paths #
TRAIN_PATH = RAW_DIR / "adult.data"
TEST_PATH = RAW_DIR / "adult.test"

# column names from UCI documentation #
# the raw files are read without a header row, so names are supplied explicitly #
COLS = [
    "age",
    "workclass",
    "fnlwgt",
    "education",
    "education_num",
    "marital_status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "capital_gain",
    "capital_loss",
    "hours_per_week",
    "native_country",
    "income",
]

# common reading parameters #
# "?" (with or without a leading space) is parsed as NaN; spaces after delimiters are skipped #
READ_OPTS = dict(
    names=COLS,
    na_values=["?", " ?"],
    skipinitialspace=True,
    # read as strings, then cast types explicitly
    dtype=str,
)

# train #
df_train = pd.read_csv(TRAIN_PATH, **READ_OPTS)
# remember which split each row came from #
df_train["source"] = "train"

# test (the first line in the file is a header, skip it) #
df_test = pd.read_csv(TEST_PATH, **READ_OPTS, skiprows=1)
df_test["source"] = "test"

# merge: stack both splits into one frame with a fresh 0..n-1 index #
df = pd.concat([df_train, df_test], ignore_index=True)

# strip spaces and clean target values #
# NOTE(review): only columns whose dtype compares equal to "object" are stripped;
# presumably dtype=str yields object columns in the pandas version in use - verify
df = df.apply(lambda c: c.str.strip() if c.dtype == "object" else c)
# unify labels that carry a trailing period with the plain "<=50K" / ">50K" spelling #
df["income"] = df["income"].replace({"<=50K.": "<=50K", ">50K.": ">50K"})

# explicit type casting for numeric features #
NUMERIC_COLS = ["age", "fnlwgt", "education_num", "capital_gain", "capital_loss", "hours_per_week"]
for c in NUMERIC_COLS:
    # NaN if garbage
    df[c] = pd.to_numeric(df[c], errors="coerce")

# logging and preview #
log_df(df, "adult_full")
df.head()

In [None]:
# --- Initial overview --- #

# dataset dimensions #
print("Shape:", df.shape)

# split the columns by dtype family #
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
obj_cols = df.select_dtypes(include=["object", "category"]).columns.tolist()

# numeric summary, extended with the 5th and 95th percentiles #
if not num_cols:
    print("[info] numeric columns not found")
else:
    display(df[num_cols].describe(percentiles=[0.05, 0.25, 0.5, 0.75, 0.95]).T)

# categorical summary: describe() output transposed to one row per column #
if not obj_cols:
    print("[info] object/category columns not found")
else:
    display(df[obj_cols].describe().T)

In [None]:
# --- Distributions of numeric features --- #

# all numeric columns currently present in df #
num_cols = df.select_dtypes(include=np.number).columns.tolist()
print(f"[ok] numeric columns: {len(num_cols)}")

if not num_cols:
    print("[warn] numeric columns not found")
else:
    # individual histograms: one figure per column, saved as hist_<col>.png #
    for col in num_cols:
        fig, ax = plt.subplots(figsize=(6, 3))
        sns.histplot(data=df, x=col, kde=True, bins=30, ax=ax, color="steelblue")
        ax.set_title(f"Distribution of {col}")
        ax.set_xlabel(col)
        ax.set_ylabel("Count")
        ax.grid(alpha=0.3)
        plt.tight_layout()
        save_fig(fig, f"hist_{col}")
        # close the figure after saving #
        plt.close(fig)

    # combined grid plot #
    ncols = 3
    # enough rows to hold every column at three panels per row #
    nrows = int(np.ceil(len(num_cols) / ncols))
    fig, axes = plt.subplots(nrows, ncols, figsize=(ncols * 4, nrows * 3))
    # flatten the axes grid so panels can be addressed with a single index #
    axes = axes.flatten()

    for i, col in enumerate(num_cols):
        sns.histplot(data=df, x=col, kde=True, bins=30, ax=axes[i], color="steelblue")
        axes[i].set_title(col)
        axes[i].grid(alpha=0.3)

    # turn off unused axes (i still holds the index of the last plotted column) #
    for j in range(i + 1, len(axes)):
        axes[j].axis("off")

    plt.tight_layout()
    save_fig(fig, "hist_numeric_all")
    plt.close(fig)

In [None]:
# --- Correlation between numeric features --- #

# all numeric columns currently present in df #
num_cols = df.select_dtypes(include=np.number).columns.tolist()

if not num_cols:
    print("[warn] numeric columns not found, correlation skipped")
else:
    # pairwise Pearson correlation matrix #
    corr = df[num_cols].corr(method="pearson")

    # annotated heatmap with a diverging palette centred on zero #
    fig, ax = plt.subplots(figsize=(6, 5))
    sns.heatmap(
        corr,
        cmap="vlag",
        center=0,
        annot=True,
        fmt=".2f",
        linewidths=0.5,
        cbar_kws={"shrink": 0.8},
        ax=ax,
    )
    ax.set_title("Correlation between numeric features (Pearson)")
    plt.tight_layout()
    save_fig(fig, "corr_numeric")
    plt.close(fig)

    # numeric table behind the figure #
    display(corr)

In [None]:
# --- Associations between categorical features (Cramér’s V) --- #


# object/category columns that take part in the pairwise association matrix #
cat_cols = df.select_dtypes(include=["object", "category"]).columns.tolist()


def cramers_v(x: pd.Series, y: pd.Series) -> float:
    """
    Computes bias-corrected Cramér’s V for two categorical features.

    Builds the contingency table of `x` and `y`, runs a chi-square test without
    continuity correction and applies the bias correction to phi^2 and to the
    table dimensions before taking the square root.

    Returns 0.0 when the statistic is undefined: fewer than two observations,
    a feature with a single observed level, or a corrected denominator that
    is not positive (as many levels as observations).
    """
    tbl = pd.crosstab(x, y, dropna=False)
    # drop levels without observations: chi2_contingency raises on zero margins
    row_mask = (tbl.sum(axis=1) > 0).to_numpy()
    col_mask = (tbl.sum(axis=0) > 0).to_numpy()
    tbl = tbl.loc[row_mask, col_mask]

    n = tbl.to_numpy().sum()
    r, k = tbl.shape
    # degenerate table: no association can be measured
    if n < 2 or r < 2 or k < 2:
        return 0.0

    chi2, _, _, _ = chi2_contingency(tbl, correction=False)
    phi2 = chi2 / n
    phi2corr = max(0.0, phi2 - (k - 1) * (r - 1) / (n - 1))
    rcorr = r - (r - 1) ** 2 / (n - 1)
    kcorr = k - (k - 1) ** 2 / (n - 1)
    denom = min(kcorr - 1, rcorr - 1)
    # the corrected dimension collapses to zero when levels == observations
    if denom <= 0:
        return 0.0
    return float(np.sqrt(phi2corr / denom))


if len(cat_cols) < 2:
    print("[info] not enough categorical features for Cramér’s V matrix")
else:
    # Cramér’s V matrix #
    # start from the identity: a feature is perfectly associated with itself, so the
    # diagonal must read 1 (a zero-filled matrix would show 0.00 there)
    cramers_mat = pd.DataFrame(np.eye(len(cat_cols)), index=cat_cols, columns=cat_cols)

    # V is symmetric: compute each unordered pair once and mirror the value #
    for i, col1 in enumerate(cat_cols):
        for col2 in cat_cols[i + 1 :]:
            v = cramers_v(df[col1], df[col2])
            cramers_mat.loc[col1, col2] = cramers_mat.loc[col2, col1] = v

    # visualization: colour scale fixed to the [0, 1] range #
    fig, ax = plt.subplots(figsize=(7, 6))
    sns.heatmap(
        cramers_mat,
        cmap="crest",
        vmin=0,
        vmax=1,
        annot=True,
        fmt=".2f",
        square=True,
        cbar_kws={"shrink": 0.7},
        ax=ax,
    )
    ax.set_title("Cramér’s V between categorical features")
    plt.tight_layout()
    save_fig(fig, "cramers_v_matrix")
    plt.close(fig)

    display(cramers_mat.round(2))

In [None]:
# --- Distributions of categorical features --- #

# object/category columns currently present in df #
cat_cols = df.select_dtypes(include=["object", "category"]).columns.tolist()
print(f"[ok] categorical columns: {len(cat_cols)}")

if not cat_cols:
    print("[warn] no categorical columns found")
else:
    for col in cat_cols:
        n_unique = df[col].nunique(dropna=True)
        fig, ax = plt.subplots(figsize=(7, 3))

        if n_unique > 20:
            # if cardinality is high, plot only the 20 most frequent values #
            # value_counts() is already sorted by frequency, so head(20) is the top 20 #
            top_vals = df[col].value_counts(dropna=False).head(20)
            # give the missing-value bucket a printable label so its bar is kept #
            value_labels = pd.Series(top_vals.index, dtype="object").fillna("<missing>")
            data_plot = pd.DataFrame(
                {"value": value_labels.to_numpy(), "count": top_vals.to_numpy()}
            )
            title = f"{col} (top 20 из {n_unique})"
            # the counts are pre-aggregated, so they are drawn with barplot; countplot
            # would count the rows of data_plot and render every bar with height 1
            sns.barplot(data=data_plot, x="value", y="count", ax=ax, color="steelblue")
        else:
            # low cardinality: let countplot aggregate the raw column #
            data_plot = df[[col]].copy()
            title = col
            sns.countplot(data=data_plot, x=col, ax=ax, color="steelblue")

        ax.set_title(f"Distribution: {title}")
        ax.set_xlabel(col)
        ax.set_ylabel("Count")
        ax.tick_params(axis="x", rotation=45)
        ax.grid(alpha=0.3)
        plt.tight_layout()
        save_fig(fig, f"bar_{col}")
        plt.close(fig)

In [None]:
# --- Categorical features vs target variable (income) --- #

target_col = "income"
# every object/category column except the target itself #
cat_cols = [c for c in df.select_dtypes(include=["object", "category"]).columns if c != target_col]

if target_col not in df.columns:
    print(f'[warn] target "{target_col}" not found')
elif not cat_cols:
    print("[warn] no categorical columns found")
else:
    for col in cat_cols:
        # features with more than 20 distinct values are skipped, not plotted #
        n_unique = df[col].nunique(dropna=True)
        if n_unique > 20:
            print(f"[skip] {col}: {n_unique} unique values")
            continue

        # grouped bars: one bar per (category, target class) pair #
        fig, ax = plt.subplots(figsize=(7, 3))
        sns.countplot(data=df, x=col, hue=target_col, ax=ax, palette="Set2")
        ax.set_title(f"{col} vs {target_col}")
        ax.set_xlabel(col)
        ax.set_ylabel("Count")
        ax.tick_params(axis="x", rotation=45)
        ax.legend(title=target_col, loc="upper right")
        ax.grid(alpha=0.3)
        plt.tight_layout()
        save_fig(fig, f"bar_{col}_vs_{target_col}")
        plt.close(fig)

In [None]:
# --- Numerical features vs target variable (income) --- #

target_col = "income"
# all numeric columns currently present in df #
num_cols = df.select_dtypes(include=np.number).columns.tolist()

if target_col not in df.columns:
    print(f'[warn] target "{target_col}" not found')
elif not num_cols:
    print("[warn] numeric columns not found")
else:
    # one box plot per numeric feature, split by target class #
    for col in num_cols:
        fig, ax = plt.subplots(figsize=(6, 3))
        sns.boxplot(
            data=df,
            x=target_col,
            y=col,
            hue=target_col,  # fix for FutureWarning
            legend=False,  # disable legend (it duplicates labels)
            palette="Set2",
            showfliers=False,  # outlier points are not drawn
            ax=ax,
        )
        ax.set_title(f"{col} vs {target_col}")
        ax.set_xlabel(target_col)
        ax.set_ylabel(col)
        ax.grid(alpha=0.3)
        plt.tight_layout()
        save_fig(fig, f"box_{col}_vs_{target_col}")
        plt.close(fig)

In [None]:
# --- Missing values analysis --- #

# count missing values #
# count missing values per column #
na_stats = df.isna().sum()
# keep only columns that have gaps, largest count first #
na_stats = na_stats[na_stats > 0].sort_values(ascending=False)

if na_stats.empty:
    print("[ok] no missing values detected.")
else:
    # absolute counts plus share of all rows, in percent rounded to 2 decimals #
    na_df = pd.DataFrame(
        {"missing_count": na_stats, "missing_pct": (na_stats / len(df) * 100).round(2)}
    )
    display(na_df)

    # visualization #
    # reset_index() moves the column names into a column called "index", used as x #
    fig, ax = plt.subplots(figsize=(7, 3))
    sns.barplot(data=na_df.reset_index(), x="index", y="missing_pct", color="steelblue", ax=ax)
    ax.set_title("Percentage of missing values by feature (%)")
    ax.set_xlabel("Feature")
    ax.set_ylabel("Missing (%)")
    ax.tick_params(axis="x", rotation=45)
    ax.grid(alpha=0.3)
    plt.tight_layout()
    save_fig(fig, "missing_values")
    plt.close(fig)

In [None]:
# --- Outliers by IQR and binary flags --- #

# prefix of the generated 0/1 flag columns #
OUTLIER_PREFIX = "is_outlier_"
# numeric columns to flag; previously generated flag columns are excluded because
# they are numeric too, and re-running this cell would otherwise create flags of
# flags (is_outlier_is_outlier_*)
NUM_COLS = [
    c
    for c in df.select_dtypes(include=np.number).columns
    if not str(c).startswith(OUTLIER_PREFIX)
]


def iqr_flags(s: pd.Series, k: float = 1.5) -> pd.Series:
    """Flag values outside [Q1 - k*IQR, Q3 + k*IQR] with 1, everything else with 0.

    When the interquartile range is zero or not finite, no value is flagged
    and an all-zero series with the index of `s` is returned.
    """
    lower_q, upper_q = s.quantile(0.25), s.quantile(0.75)
    spread = upper_q - lower_q
    if np.isfinite(spread) and spread != 0:
        too_low = s < lower_q - k * spread
        too_high = s > upper_q + k * spread
        return (too_low | too_high).astype(int)
    # degenerate spread: the rule cannot separate anything #
    return pd.Series(np.zeros(len(s), dtype=int), index=s.index)


if not NUM_COLS:
    print("[warn] numeric columns not found, outlier flags skipped")
else:
    # add one 0/1 flag column per numeric feature, named <OUTLIER_PREFIX><feature> #
    for c in NUM_COLS:
        flag_col = f"{OUTLIER_PREFIX}{c}"
        df[flag_col] = iqr_flags(df[c])

    # brief summary of outlier share #
    # all columns carrying the prefix are collected, then flagged rows / total rows #
    flag_cols = [c for c in df.columns if c.startswith(OUTLIER_PREFIX)]
    outlier_share = (df[flag_cols].sum().sort_values(ascending=False) / len(df)).round(4)
    display(outlier_share.to_frame("share"))
    print(f"[ok] flags added: {len(flag_cols)}")

In [None]:
# --- Custom feature (feature engineering) --- #


def add_custom_features(dfin: pd.DataFrame) -> pd.DataFrame:
    """
    Return a copy of `dfin` extended with engineered columns.

    Added columns:
    - capital_net: capital_gain minus capital_loss (missing values count as 0);
      NaN when either source column is absent.
    - hours_per_edu: hours_per_week divided by education_num, with a zero
      education_num treated as missing; only added when both columns exist.
    - has_capital: 1 where capital_net is non-zero, otherwise 0.

    The input frame is not modified.
    """
    out = dfin.copy()
    available = set(out.columns)

    # net capital #
    if {"capital_gain", "capital_loss"} <= available:
        gain = out["capital_gain"].fillna(0)
        loss = out["capital_loss"].fillna(0)
        out["capital_net"] = (gain - loss).astype(float)
    else:
        out["capital_net"] = np.nan

    # hours worked per unit of education #
    if {"hours_per_week", "education_num"} <= available:
        with np.errstate(divide="ignore", invalid="ignore"):
            hours = out["hours_per_week"].astype(float)
            edu = out["education_num"].replace(0, np.nan).astype(float)
            out["hours_per_edu"] = hours / edu

    # non-zero capital indicator (capital_net always exists at this point) #
    out["has_capital"] = (out["capital_net"].fillna(0) != 0).astype(int)

    return out


# replace df with the enriched copy returned by add_custom_features #
df = add_custom_features(df)
log_df(df, "adult_with_custom_features")

# quick preview: source columns next to the features derived from them #
df[
    [
        "capital_gain",
        "capital_loss",
        "capital_net",
        "hours_per_week",
        "education_num",
        "hours_per_edu",
        "has_capital",
    ]
].head()

In [None]:
# --- Age binning (age_group) --- #

if "age" not in df.columns:
    print('[warn] "age" not in columns, age_group skipped')
else:
    # right-closed bins: [0, 24], (24, 34], (34, 44], (44, 54], (54, 64], (64, inf] #
    # NOTE(review): the first bin starts at 0, so any age below 18 is also labelled
    # "18–24" - confirm that this label is intended
    bins = [0, 24, 34, 44, 54, 64, np.inf]
    labels = ["18–24", "25–34", "35–44", "45–54", "55–64", "65+"]
    df["age_group"] = pd.cut(df["age"], bins=bins, labels=labels, right=True, include_lowest=True)
    # explicit cast to the category dtype #
    df["age_group"] = df["age_group"].astype("category")
    print("[ok] age_group created")

# Export EDA dataset #

# make sure the output directory exists before writing #
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
p_parquet = PROCESSED_DIR / "adult_eda.parquet"
p_csv = PROCESSED_DIR / "adult_eda.csv"

# export: the same frame in both formats, without the row index #
df.to_parquet(p_parquet, index=False)
df.to_csv(p_csv, index=False)

print(f"[saved] {p_parquet}")
print(f"[saved] {p_csv}")

In [None]:
# --- Export of key figures (control check) --- #

# file names of the summary figures that earlier cells must have produced #
expected = [
    "hist_numeric_all.png",
    "corr_numeric.png",
    "cramers_v_matrix.png",
    "missing_values.png",
]
# plus individual ones: hist_*, bar_*, box_* - already saved in previous cells #

# collect every expected figure that is absent from the figures directory #
missing = [fname for fname in expected if not FIG_DIR_01.joinpath(fname).exists()]
if not missing:
    print("[ok] key figures present")
else:
    print("[warn] missing figures:", missing)

# --- Final EDA Summary ---

**Data quality:**
1. Missing values are concentrated in three features: `workclass`, `occupation`, `native_country` (see `missing_values.png`). Other fields are complete.
2. Categorical values are standardized; rare categories are mostly found in `native_country`. This is relevant for grouping into “Other” before modeling.
3. No explicit duplicates by key fields were detected in the current review.
4. The target variable is imbalanced: `>50k` occurs significantly less often than `<=50k` (see `bar_income.png`). Stratification and baseline control will be required for metric evaluation.

**Basic distributions of numeric features:**
1. Age: right-skewed, concentrated between 25–45; higher-income individuals are more frequent in the 35–55 range (see `hist_age.png` and `box_age_vs_income.png`).
2. `hours_per_week`: modal region around 40 hours, “heavy tails” for overtime; median higher for the `>50k` group (see `hist_hours_per_week.png`, `box_hours_per_week_vs_income.png`).
3. `education_num`: monotonically increasing relationship with income; higher values typical for `>50k` (see `hist_education_num.png`, `box_education_num_vs_income.png`).
4. `capital_gain` and `capital_loss`: extremely sparse, dominated by zeros with rare large values; non-zero values are strongly associated with `>50k` (see `hist_capital_gain.png`, `hist_capital_loss.png`, `box_capital_gain_vs_income.png`, `box_capital_loss_vs_income.png`).
5. `fnlwgt`: broad distribution with no clearly interpretable relationship to income (see `hist_fnlwgt.png`, `box_fnlwgt_vs_income.png`).
6. Numeric correlations are generally low but noticeable for `education_num`, `hours_per_week`, and presence of `capital_gain` (see `corr_numeric.png`).

**Categorical features: profile and relationship with income:**
1. Education: bachelor’s degree and above more common for `>50k`; lower and middle education levels dominate `<=50k` (see `bar_education.png`, `bar_education_vs_income.png`).
2. Marital status: `Married-civ-spouse` dominates among `>50k`; `Never-married` and `Divorced` dominate `<=50k` (see `bar_marital_status.png`, `bar_marital_status_vs_income.png`).
3. Household role (`relationship`): categories linked to being a spouse or household head are more frequent in `>50k` (see `bar_relationship.png`, `bar_relationship_vs_income.png`).
4. Occupation: `Exec-managerial`, `Prof-specialty`, partly `Tech-support` and `Sales` show higher shares of `>50k`; `Handlers-cleaners`, `Other-service`, and `Machine-op-inspct` dominate `<=50k` (see `bar_occupation.png`, `bar_occupation_vs_income.png`).
5. Sex: men more often in `>50k`, women in `<=50k` (see `bar_sex.png`, `bar_sex_vs_income.png`).
6. Race: income differences exist but are smaller compared to education, marital status, or occupation (see `bar_race.png`, `bar_race_vs_income.png`).
7. `workclass`: private and government sectors show higher `>50k` proportions, while `Without-pay`/`Never-worked` are almost always `<=50k` (see `bar_workclass.png`, `bar_workclass_vs_income.png`).
8. `native_country`: U.S. dominates the dataset; several countries have small samples, limiting statistical interpretation (see `bar_native_country.png`).
9. `source`: distributions align with expectations from the train/test split (see `bar_source.png`).

**Feature associations:**
1. The Cramér’s V matrix shows clear relationships: `marital_status <-> relationship`, and clusters of employment features `occupation <-> workclass` with ties to education level (see `cramers_v_matrix.png`). This indicates partial redundancy and potential multicollinearity after one-hot encoding.
2. For numeric features, strong linear correlations are absent, reducing collinearity risk in linear models (see `corr_numeric.png`).

**Key predictors and effect direction:**
1. Most informative predictors for `>50k`: nonzero `capital_gain`, high `education_num`, occupations `Exec-managerial`/`Prof-specialty`, marital status `Married-civ-spouse`, and higher `hours_per_week`.
2. Moderately informative: `sex` (male), certain `workclass` and `relationship` categories.
3. Weak or ambiguous: `fnlwgt` and long tails of `capital_loss` with unstable effect.
4. Age effect is nonlinear: probability of `>50k` rises until mid-age, then plateaus (see `hist_age.png`, `box_age_vs_income.png`).

**Modeling implications:**
1. Missing values in `workclass`/`occupation`/`native_country` must be handled, and rare categories (especially in `native_country`) aggregated for stable estimates.
2. Target imbalance requires stratified validation and metrics robust to skew.
3. High sparsity of `capital_gain`/`capital_loss` suggests binary indicators for presence and possible log-transform for nonzero values.
4. Categorical clusters (`occupation`–`workclass`–`education`) call for regularization and control of one-hot width; tree-based and boosting models can naturally capture nonlinearities and interactions.
5. `fnlwgt` appears weakly informative and can likely be dropped without performance loss.

**Conclusions:**
1. Income `>50k` is primarily associated with human capital and employment profile: education, occupation type, marital status, working hours, and presence of investment income (`capital_gain`).
2. Gender and some demographic factors play a role but are secondary to education, profession, and workload.
3. Data quality is sufficient for modeling after targeted handling of missing values and rare categories; models capable of nonlinear and sparse signal capture are expected to perform best.