In [41]:
# =========================================================
# DATA PREPROCESSING & FEATURE ENGINEERING — Adult Dataset
# =========================================================
# This single cell is organized into clear "steps" for Jupyter.
# If you prefer, split each STEP into separate notebook cells.
# =========================================================

# ---------- STEP 0. Imports & Config ----------
import os
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OrdinalEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import IsolationForest
from sklearn.feature_selection import mutual_info_classif  # <-- FIXED: correct import

# Robust OHE across sklearn versions (sparse_output vs sparse)
def make_ohe():
    try:
        return OneHotEncoder(handle_unknown="ignore", sparse_output=False)
    except TypeError:
        # Older sklearn versions use `sparse` instead of `sparse_output`
        return OneHotEncoder(handle_unknown="ignore", sparse=False)

# Use YOUR Windows path (keep the r prefix!)
DATA_PATH = r"C:\Users\user\Data-Science-main\adult_with_headers.csv"
assert os.path.exists(DATA_PATH), f"File not found at {DATA_PATH}. Update DATA_PATH."

In [43]:
# ---------- STEP 1. Load & Basic EDA ----------
df_raw = pd.read_csv(DATA_PATH)
df = df_raw.copy()

# Clean headers
df.columns = [c.strip().lower() for c in df.columns]

# Replace "?" (Adult dataset placeholder) with NaN; strip whitespace
df.replace("?", np.nan, inplace=True)
for c in df.select_dtypes(include=["object"]).columns:
    df[c] = df[c].astype(str).str.strip().replace({"nan": np.nan})

# Try to identify the target column robustly (commonly 'income')
possible_targets = [c for c in df.columns if "income" in c.lower() or c.lower() in ["target", "class"]]
TARGET_COL = possible_targets[0] if possible_targets else df.columns[-1]
print(f"Detected TARGET_COL: {TARGET_COL!r}")

print("\n=== SHAPE ===")
print(df.shape)

print("\n=== DTYPE INFO ===")
print(df.dtypes)

print("\n=== MISSING VALUES (count) ===")
print(df.isna().sum())

print("\n=== TARGET DISTRIBUTION ===")
print(df[TARGET_COL].value_counts(dropna=False))

# Coerce likely numeric columns just in case they were parsed as object
likely_numeric = ["age","fnlwgt","education-num","capital-gain","capital-loss","hours-per-week"]
for col in likely_numeric:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors="coerce")

print("\n=== NUMERIC SUMMARY (describe) ===")
print(df.select_dtypes(include=[np.number]).describe().T)

Detected TARGET_COL: 'income'

=== SHAPE ===
(32561, 15)

=== DTYPE INFO ===
age                int64
workclass         object
fnlwgt             int64
education         object
education_num      int64
marital_status    object
occupation        object
relationship      object
race              object
sex               object
capital_gain       int64
capital_loss       int64
hours_per_week     int64
native_country    object
income            object
dtype: object

=== MISSING VALUES (count) ===
age               0
workclass         0
fnlwgt            0
education         0
education_num     0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
capital_gain      0
capital_loss      0
hours_per_week    0
native_country    0
income            0
dtype: int64

=== TARGET DISTRIBUTION ===
income
<=50K    24720
>50K      7841
Name: count, dtype: int64

=== NUMERIC SUMMARY (describe) ===
                  count           mean            std      mi

In [45]:
# ---------- STEP 2. Handle Missing Values ----------
# Define imputers (best practices)
num_imputer = SimpleImputer(strategy="median")
cat_imputer = SimpleImputer(strategy="most_frequent")

# Separate features by dtype
all_features = [c for c in df.columns if c != TARGET_COL]
num_cols = [c for c in all_features if pd.api.types.is_numeric_dtype(df[c])]
cat_cols = [c for c in all_features if c not in num_cols]

print("\n=== NUMERIC COLS ===")
print(num_cols)
print("\n=== CATEGORICAL COLS ===")
print(cat_cols)

# Apply imputation for an imputed working copy (Pipelines will re-impute later)
df_imputed = df.copy()
df_imputed[num_cols] = num_imputer.fit_transform(df_imputed[num_cols])
df_imputed[cat_cols] = cat_imputer.fit_transform(df_imputed[cat_cols])

print("\nImputation complete (preview):")
print(df_imputed.head())


=== NUMERIC COLS ===
['age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week']

=== CATEGORICAL COLS ===
['workclass', 'education', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'native_country']

Imputation complete (preview):
    age         workclass    fnlwgt  education  education_num  \
0  39.0         State-gov   77516.0  Bachelors           13.0   
1  50.0  Self-emp-not-inc   83311.0  Bachelors           13.0   
2  38.0           Private  215646.0    HS-grad            9.0   
3  53.0           Private  234721.0       11th            7.0   
4  28.0           Private  338409.0  Bachelors           13.0   

       marital_status         occupation   relationship   race     sex  \
0       Never-married       Adm-clerical  Not-in-family  White    Male   
1  Married-civ-spouse    Exec-managerial        Husband  White    Male   
2            Divorced  Handlers-cleaners  Not-in-family  White    Male   
3  Married-civ-spouse  Handlers-cleaners

In [47]:
# ---------- STEP 3. Scaling: Standard vs MinMax ----------
scaler_std = StandardScaler()
scaler_mm  = MinMaxScaler()

num_std = pd.DataFrame(
    scaler_std.fit_transform(df_imputed[num_cols]),
    columns=[f"{c}__std" for c in num_cols]
)
num_mm  = pd.DataFrame(
    scaler_mm.fit_transform(df_imputed[num_cols]),
    columns=[f"{c}__mm" for c in num_cols]
)

print("\n=== SCALING PREVIEW (first 3 rows of each) ===")
print(pd.concat([num_std.head(3), num_mm.head(3)], axis=1))

print("\nWhen to use which scaling:")
print("""
- StandardScaler: centers to mean 0 & unit variance. Good for models using dot products/distances or Gaussian assumptions
  (Logistic/Linear Regression, SVM, PCA, k-means, KNN).
- MinMaxScaler: rescales to [0,1]. Good when bounds matter, to preserve zero/positivity, or for neural nets.
""")



=== SCALING PREVIEW (first 3 rows of each) ===
   age__std  fnlwgt__std  education_num__std  capital_gain__std  \
0  0.030671    -1.063611            1.134739           0.148453   
1  0.837109    -1.008707            1.134739          -0.145920   
2 -0.042642     0.245079           -0.420060          -0.145920   

   capital_loss__std  hours_per_week__std   age__mm  fnlwgt__mm  \
0           -0.21666            -0.035429  0.301370    0.044302   
1           -0.21666            -2.222153  0.452055    0.048238   
2           -0.21666            -0.035429  0.287671    0.138113   

   education_num__mm  capital_gain__mm  capital_loss__mm  hours_per_week__mm  
0           0.800000           0.02174               0.0            0.397959  
1           0.800000           0.00000               0.0            0.122449  
2           0.533333           0.00000               0.0            0.397959  

When to use which scaling:

- StandardScaler: centers to mean 0 & unit variance. Good for models 

In [49]:
# ---------- STEP 4. Encoding: OHE (<5 cats) & Label/Ordinal (>=5 cats) ----------
# We'll treat label encoding as per-feature OrdinalEncoder (LabelEncoder is 1D and for targets).
low_card_cats  = [c for c in cat_cols if df_imputed[c].nunique(dropna=True) < 5]
high_card_cats = [c for c in cat_cols if c not in low_card_cats]

print("\nLow-cardinality categorical columns (OHE):", low_card_cats)
print("High-cardinality categorical columns (Ordinal):", high_card_cats)

ohe = make_ohe()
ord_enc = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)

# A base transformer (for demonstration/preview)
pre_base = ColumnTransformer(
    transformers=[
        ("num", num_imputer, num_cols),
        ("ohe", Pipeline([("impute", cat_imputer), ("enc", ohe)]), low_card_cats),
        ("ord", Pipeline([("impute", cat_imputer), ("enc", ord_enc)]), high_card_cats),
    ],
    remainder="drop",
    verbose_feature_names_out=False,
)

print("\nPros/Cons:")
print("""
One-Hot Encoding (OHE)
  + No artificial ordering; safe for linear & tree models.
  + Great for low-cardinality features.
  - Can blow up dimensionality with many categories.
  - More memory (often sparse).

Label/Ordinal Encoding
  + Compact (one column).
  + Useful for high-cardinality features to avoid OHE explosion.
  - Imposes arbitrary order; can mislead linear/distance-based models.
  - Trees tolerate this better, but still be cautious.
""")


Low-cardinality categorical columns (OHE): ['sex']
High-cardinality categorical columns (Ordinal): ['workclass', 'education', 'marital_status', 'occupation', 'relationship', 'race', 'native_country']

Pros/Cons:

One-Hot Encoding (OHE)
  + No artificial ordering; safe for linear & tree models.
  + Great for low-cardinality features.
  - Can blow up dimensionality with many categories.
  - More memory (often sparse).

Label/Ordinal Encoding
  + Compact (one column).
  + Useful for high-cardinality features to avoid OHE explosion.
  - Imposes arbitrary order; can mislead linear/distance-based models.
  - Trees tolerate this better, but still be cautious.



In [51]:
# ---------- STEP 5. Feature Engineering (WITH EXPLANATIONS) ----------
df_fe = df_imputed.copy()

# (F1) capital_net = capital-gain - capital-loss
# Rationale: condenses two skewed monetary signals into one net indicator of investment income → predictive of >50K.
if "capital-gain" in df_fe.columns and "capital-loss" in df_fe.columns:
    df_fe["capital_net"] = df_fe["capital-gain"].fillna(0) - df_fe["capital-loss"].fillna(0)
else:
    df_fe["capital_net"] = 0.0

# (F2) is_married (binary from 'marital-status')
# Rationale: marital status correlates with household structure & income; married often associates with higher income in census data.
if "marital-status" in df.columns:
    df_fe["is_married"] = df["marital-status"].astype(str).str.contains("Married", na=False).astype(int)
else:
    df_fe["is_married"] = 0

# (F3) age_bucket (categorical bins)
# Rationale: captures non-linear earning patterns across life stages (young/adult/mid_aged/senior).
if "age" in df_fe.columns:
    df_fe["age_bucket"] = pd.cut(
        df_fe["age"],
        bins=[0, 25, 45, 65, np.inf],
        labels=["young", "adult", "mid_aged", "senior"],
        right=False
    )

# (F4) long_hours (binary: ≥50 hrs/week)
# Rationale: proxies work intensity/overtime which can map to higher salary bands.
if "hours-per-week" in df_fe.columns:
    df_fe["long_hours"] = (df_fe["hours-per-week"] >= 50).astype(int)

print("\n=== FEATURE ENGINEERING — RATIONALE ===")
print("""\
1) capital_net = capital_gain - capital_loss
   Why: condenses two skewed investment-related features into a single net wealth signal.

2) is_married (1 if marital-status contains 'Married' else 0)
   Why: captures household/relationship factors often related to income.

3) age_bucket (young/adult/mid_aged/senior)
   Why: models non-linear income trends across career stages without forcing linearity.

4) long_hours (1 if hours-per-week ≥ 50)
   Why: indicates higher work intensity, often associated with higher earnings.
""")


=== FEATURE ENGINEERING — RATIONALE ===
1) capital_net = capital_gain - capital_loss
   Why: condenses two skewed investment-related features into a single net wealth signal.

2) is_married (1 if marital-status contains 'Married' else 0)
   Why: captures household/relationship factors often related to income.

3) age_bucket (young/adult/mid_aged/senior)
   Why: models non-linear income trends across career stages without forcing linearity.

4) long_hours (1 if hours-per-week ≥ 50)
   Why: indicates higher work intensity, often associated with higher earnings.



In [53]:
# ---------- STEP 6. Transform a skewed feature (log) ----------
# capital-gain is strongly right-skewed in Adult; log1p stabilizes variance & reduces outlier leverage.
if "capital-gain" in df_fe.columns:
    df_fe["capital_gain_log"] = np.log1p(df_fe["capital-gain"].clip(lower=0))
    if "capital_gain_log" not in num_cols:
        num_cols.append("capital_gain_log")
    print("\nApplied log1p to 'capital-gain' → 'capital_gain_log' due to strong right skew.")

# Prepare X, y
y = df_fe[TARGET_COL].copy()
X = df_fe.drop(columns=[TARGET_COL])

# Recompute categorical candidates (after FE)
new_cat_cols = [c for c in X.columns if not pd.api.types.is_numeric_dtype(X[c])]
low_card_cats  = [c for c in new_cat_cols if X[c].astype("object").nunique(dropna=True) < 5]
high_card_cats = [c for c in new_cat_cols if c not in low_card_cats]

print("\nUpdated low/high-card categorical splits after FE:")
print("Low-card (OHE):", low_card_cats)
print("High-card (Ordinal):", high_card_cats)


Updated low/high-card categorical splits after FE:
Low-card (OHE): ['sex', 'age_bucket']
High-card (Ordinal): ['workclass', 'education', 'marital_status', 'occupation', 'relationship', 'race', 'native_country']


In [55]:
# ---------- STEP 7. Outlier Detection: IsolationForest ----------
# Use numeric-only snapshot (already imputed) for outlier detection
X_num_only = X.select_dtypes(include=[np.number]).copy()
X_num_only = pd.DataFrame(SimpleImputer(strategy="median").fit_transform(X_num_only), columns=X_num_only.columns)

iso = IsolationForest(random_state=42, contamination="auto")
outlier_flag = iso.fit_predict(X_num_only)   # -1 outlier, 1 inlier
inlier_mask = outlier_flag == 1

print(f"\nIsolationForest flagged {np.sum(~inlier_mask)} potential outliers out of {len(inlier_mask)} rows.")

X_clean = X.loc[inlier_mask].reset_index(drop=True)
y_clean = y.loc[inlier_mask].reset_index(drop=True)

print("Outliers removed. Clean shapes:", X_clean.shape, y_clean.shape)
print("Note: Outliers can distort parameter estimates and distance-based models; removing them often stabilizes training.")



IsolationForest flagged 3879 potential outliers out of 32561 rows.
Outliers removed. Clean shapes: (28682, 17) (28682,)
Note: Outliers can distort parameter estimates and distance-based models; removing them often stabilizes training.


In [57]:
# ---------- STEP 8. PPS (Predictive Power Score) vs Correlation ----------
# PPS can fail if ppscore package isn't installed; fall back to Mutual Information (MI).
pps_available = False
try:
    import ppscore as pps
    pps_available = True
except Exception:
    print("\n[Note] ppscore not available. Install with:  pip install ppscore")
    print("Proceeding with Mutual Information as a robust fallback.\n")

# Helper: map target to binary for correlation
def to_binary_series(s):
    ss = s.astype(str).str.strip()
    # Adult: ' >50K' vs ' <=50K' (with or without spaces)
    if ss.str.contains(">50", regex=False).any():
        return (ss.str.contains(">50", regex=False)).astype(int)
    # If exactly two classes, map to 0/1
    vals = ss.unique().tolist()
    if len(vals) == 2:
        m = {vals[0]:0, vals[1]:1}
        return ss.map(m).astype(int)
    # Multi-class fallback
    return pd.factorize(ss)[0]

# 8a) Correlation (numeric only, Pearson)
y_bin = to_binary_series(y_clean)
num_only_clean = X_clean.select_dtypes(include=[np.number]).copy()
corr_df = pd.concat([num_only_clean, y_bin.rename("__target__")], axis=1)
corr_series = corr_df.corr(numeric_only=True)["__target__"].drop("__target__", errors="ignore")
corr_top = corr_series.sort_values(key=lambda s: s.abs(), ascending=False).head(10)

print("\n=== Top numeric correlations with target (absolute) ===")
print(corr_top)

# 8b) PPS if available; else MI on an encoded view of X_clean
if pps_available:
    # PPS expects a DataFrame with both features and the target column present
    df_pps = X_clean.copy()
    df_pps[y_clean.name] = y_clean

    pps_scores = []
    for col in X_clean.columns:
        try:
            sc = pps.score(df_pps, col, y_clean.name)["ppscore"]
        except Exception:
            sc = np.nan
        pps_scores.append((col, sc))
    pps_df = pd.DataFrame(pps_scores, columns=["feature", "pps"]).sort_values("pps", ascending=False)
    print("\n=== Top PPS features (feature → target) ===")
    print(pps_df.head(15))
else:
    # Mutual Information fallback (continuous + categorical, non-linear capable)
    ohe = make_ohe()
    ord_enc = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)

    enc_pre = ColumnTransformer(
        transformers=[
            ("num", "passthrough", [c for c in X_clean.columns if pd.api.types.is_numeric_dtype(X_clean[c])]),
            ("ohe", Pipeline([("impute", SimpleImputer(strategy="most_frequent")), ("enc", ohe)]), low_card_cats),
            ("ord", Pipeline([("impute", SimpleImputer(strategy="most_frequent")), ("enc", ord_enc)]), high_card_cats),
        ],
        remainder="drop",
        verbose_feature_names_out=False,
    )
    X_enc = enc_pre.fit_transform(X_clean)
    if hasattr(X_enc, "toarray"):
        X_enc = X_enc.toarray()
    mi_vals = mutual_info_classif(X_enc, y_bin, discrete_features=False, random_state=42)
    try:
        feat_names = enc_pre.get_feature_names_out()
    except Exception:
        feat_names = [f"f{i}" for i in range(X_enc.shape[1])]
    mi_df = pd.DataFrame({"feature": feat_names, "MI": mi_vals}).sort_values("MI", ascending=False)
    print("\n[Fallback] Top features by Mutual Information (proxy for PPS):")
    print(mi_df.head(15))

print("\nHow PPS vs Correlation differ?")
print("""
- Pearson correlation: linear-only, numeric-only, symmetric (X~Y).
- PPS: model-based, directional (X→Y), handles categorical & non-linear relations.
- Expect features with weak linear correlation to still rank high on PPS/MI if they are informative non-linearly.
""")


[Note] ppscore not available. Install with:  pip install ppscore
Proceeding with Mutual Information as a robust fallback.


=== Top numeric correlations with target (absolute) ===
education_num     0.296020
capital_gain      0.250708
age               0.232649
hours_per_week    0.212993
fnlwgt           -0.008689
capital_loss      0.001005
capital_net            NaN
is_married             NaN
Name: __target__, dtype: float64

[Fallback] Top features by Mutual Information (proxy for PPS):
                feature        MI
16       marital_status  0.108651
18         relationship  0.107793
0                   age  0.059758
17           occupation  0.053070
2         education_num  0.049875
15            education  0.048107
3          capital_gain  0.047880
13     age_bucket_young  0.044055
5        hours_per_week  0.033388
1                fnlwgt  0.028061
8            sex_Female  0.024591
9              sex_Male  0.023257
14            workclass  0.013812
11  age_bucket_mid_aged  0.013

In [59]:
# ---------- STEP 9. Final End-to-End Preprocessor (ready for modeling) ----------
# Choose your scaler here (we'll use StandardScaler in the final pipeline)
final_num_pipe = Pipeline(steps=[
    ("impute", SimpleImputer(strategy="median")),
    ("scale", StandardScaler())
])
final_ohe_pipe = Pipeline(steps=[
    ("impute", SimpleImputer(strategy="most_frequent")),
    ("enc", make_ohe())
])
final_ord_pipe = Pipeline(steps=[
    ("impute", SimpleImputer(strategy="most_frequent")),
    ("enc", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1))
])

final_preprocessor = ColumnTransformer(
    transformers=[
        ("num", final_num_pipe, [c for c in X_clean.columns if pd.api.types.is_numeric_dtype(X_clean[c])]),
        ("ohe", final_ohe_pipe, low_card_cats),
        ("ord", final_ord_pipe, high_card_cats),
    ],
    remainder="drop",
    verbose_feature_names_out=True,
)

X_ready = final_preprocessor.fit_transform(X_clean)
print("\nFinal preprocessed feature matrix shape:", X_ready.shape)
print("\nAll steps complete ")


Final preprocessed feature matrix shape: (28682, 21)

All steps complete 
