In [None]:
import os
import warnings

import joblib
import numpy as np
import pandas as pd

# Load models and feature columns.
# The artifact directory can be overridden with the CENSUS_MODEL_DIR environment
# variable so the notebook runs on machines other than the original author's;
# when the variable is unset the original location is used unchanged.
model_dir = os.environ.get(
    "CENSUS_MODEL_DIR",
    r"C:\Users\andras.janko\Documents\CensusIncomePrediction\_models",
)

# NOTE: joblib.load unpickles arbitrary Python objects -- only point model_dir
# at artifacts you produced yourself or otherwise trust.
xgb_default = joblib.load(os.path.join(model_dir, "xgb_default.joblib"))
xgb_balanced = joblib.load(os.path.join(model_dir, "xgb_balanced.joblib"))
# Column names, in the order they are selected when building model inputs.
feature_columns = joblib.load(os.path.join(model_dir, "feature_columns.joblib"))

print("Models loaded successfully")
print(f"Expected features: {len(feature_columns)}")


def _one_hot_encode(raw_input: dict, field: str, encoded: list, baseline: str) -> dict:
    """Build the 0/1 indicator entries ``{field}_{category}`` for one categorical field.

    Args:
        raw_input: The raw person dictionary.
        field: Key to read from ``raw_input`` (e.g. ``'workclass'``).
        encoded: Categories that have their own indicator column.
        baseline: Valid category that has no indicator column and is therefore
            represented by all indicators being 0.

    Returns:
        A dict with one ``f'{field}_{category}'`` entry per item of ``encoded``.
    """
    value = raw_input.get(field)
    if value is None:
        # An absent key and an explicit None both mean "not reported".
        value = 'Unknown'
    elif value != baseline and value not in encoded:
        # The value would silently look exactly like the baseline category
        # (all indicators 0), which usually hides a typo -- make it visible.
        warnings.warn(
            f"Unrecognized {field} value {value!r}; it has no indicator column "
            f"and will be encoded like the baseline category {baseline!r}.",
            stacklevel=3,
        )
    return {f'{field}_{cat}': 1 if value == cat else 0 for cat in encoded}


def preprocess_person(raw_input: dict) -> pd.DataFrame:
    """
    Convert a raw dictionary describing a person into a one-row, model-ready
    DataFrame.

    28 features are built here (8 numeric/binary + 6 workclass indicators +
    14 occupation indicators); the returned columns are then selected and
    ordered by the module-level ``feature_columns`` list.

    Expected raw_input keys:
        age, workclass, education_num, marital_status, occupation,
        sex, capital_gain, capital_loss, hours_per_week, native_country

    Encoding rules:
        * capital_gain / capital_loss are transformed with ``log1p``.
        * sex: 'Male' -> 1, anything else -> 0.
        * marital_status: 'Married-civ-spouse' / 'Married-AF-spouse' -> 1, else 0.
        * native_country: 'United-States' -> 1, else 0.
        * workclass / occupation: one-hot. A missing key or ``None`` is treated
          as 'Unknown'. 'Federal-gov' and 'Adm-clerical' (listed as valid values
          in the notebook's input guide) have no indicator column and encode as
          all zeros -- presumably the categories dropped during training;
          confirm against the training notebook. Any other unrecognized value
          is encoded the same way but triggers a ``UserWarning``.

    Raises:
        KeyError: if a required key is missing from ``raw_input`` or
            ``feature_columns`` names a column that is not built here.
        ValueError: if capital_gain or capital_loss is negative (``log1p``
            would otherwise yield NaN or -inf without an error).
    """
    row = {}

    # Numeric features passed through unchanged
    row['age'] = raw_input['age']
    row['education_num'] = raw_input['education_num']
    row['hours_per_week'] = raw_input['hours_per_week']

    # Log transform capital gain/loss (raw dollar amounts must be >= 0)
    for field in ('capital_gain', 'capital_loss'):
        amount = raw_input[field]
        if amount < 0:
            raise ValueError(f"{field} must be non-negative, got {amount!r}")
        row[field] = np.log1p(amount)

    # Binary: sex (Male=1, Female=0)
    row['sex'] = 1 if raw_input['sex'] == 'Male' else 0

    # Binary: marital_status (Married=1, Not-married=0)
    married_values = ['Married-civ-spouse', 'Married-AF-spouse']
    row['marital_status'] = 1 if raw_input['marital_status'] in married_values else 0

    # Binary: native_country (US=1, Non-US=0)
    row['native_country'] = 1 if raw_input['native_country'] == 'United-States' else 0

    # One-hot: workclass
    workclass_cats = ['Local-gov', 'Private', 'Self-emp-inc', 'Self-emp-not-inc',
                      'State-gov', 'Unknown']
    row.update(_one_hot_encode(raw_input, 'workclass', workclass_cats, 'Federal-gov'))

    # One-hot: occupation
    occupation_cats = ['Armed-Forces', 'Craft-repair', 'Exec-managerial', 'Farming-fishing',
                       'Handlers-cleaners', 'Machine-op-inspct', 'Other-service',
                       'Priv-house-serv', 'Prof-specialty', 'Protective-serv', 'Sales',
                       'Tech-support', 'Transport-moving', 'Unknown']
    row.update(_one_hot_encode(raw_input, 'occupation', occupation_cats, 'Adm-clerical'))

    # Select and order the columns as listed in feature_columns.
    df = pd.DataFrame([row])[feature_columns]
    return df


def predict_income(raw_input: dict, model_type='default', threshold: float = 0.5):
    """Predict the income class for a person.

    Args:
        raw_input: Raw person dictionary, passed to ``preprocess_person``.
        model_type: ``'default'`` selects ``xgb_default``; ``'balanced'``
            selects ``xgb_balanced``.
        threshold: Positive-class probability at or above which the
            prediction is ``'>50K'``. Defaults to 0.5.

    Returns:
        ``(prediction, proba)`` where ``prediction`` is ``'>50K'`` or
        ``'<=50K'`` and ``proba`` is the second column of the model's
        ``predict_proba`` output for the single input row.

    Raises:
        ValueError: if ``model_type`` is neither ``'default'`` nor
            ``'balanced'`` (previously any other value silently selected the
            balanced model, so a typo changed the model without notice).
    """
    models = {'default': xgb_default, 'balanced': xgb_balanced}
    try:
        model = models[model_type]
    except KeyError:
        raise ValueError(
            f"Unknown model_type {model_type!r}; expected 'default' or 'balanced'"
        ) from None

    X = preprocess_person(raw_input)
    # Row 0 is the only row; column 1 is taken as the '>50K' probability.
    proba = model.predict_proba(X)[0][1]
    prediction = '>50K' if proba >= threshold else '<=50K'
    return prediction, proba


# Printed once the models are loaded and both helper functions above are defined.
print("\nInference pipeline ready!")

Models loaded successfully
Expected features: 28

Inference pipeline ready!


In [None]:
# Print a plain-text reference of the raw_input fields accepted by
# predict_income / preprocess_person, plus the two model_type options.
# The text below is emitted verbatim; keep it in sync with preprocess_person.
print("""
================================================================================
                        INCOME PREDICTION - INPUT GUIDE
================================================================================

FIELD               TYPE        VALUES / NOTES
─────────────────────────────────────────────────────────────────────────────────
age                 int         17-90

education_num       int         1  = Preschool
                                2  = 1st-4th grade
                                3  = 5th-6th grade
                                4  = 7th-8th grade
                                5  = 9th grade
                                6  = 10th grade
                                7  = 11th grade
                                8  = 12th grade
                                9  = High school graduate
                                10 = Some college
                                11 = Associate (vocational)
                                12 = Associate (academic)
                                13 = Bachelors
                                14 = Masters
                                15 = Professional school (e.g. law, medicine)
                                16 = Doctorate

marital_status      str         'Married-civ-spouse', 'Married-AF-spouse',
                                'Never-married', 'Divorced', 'Separated',
                                'Widowed', 'Married-spouse-absent'
                                (internally mapped: Married-civ/AF → 1, rest → 0)

sex                 str         'Male' or 'Female'

workclass           str         'Private', 'Self-emp-not-inc', 'Self-emp-inc',
                                'Federal-gov', 'Local-gov', 'State-gov', 'Unknown'

occupation          str         'Exec-managerial', 'Prof-specialty', 'Craft-repair',
                                'Sales', 'Adm-clerical', 'Other-service',
                                'Machine-op-inspct', 'Transport-moving',
                                'Handlers-cleaners', 'Farming-fishing',
                                'Tech-support', 'Protective-serv',
                                'Priv-house-serv', 'Armed-Forces', 'Unknown'

capital_gain        int         0-99999 (raw dollar amount, log-transformed internally)
capital_loss        int         0-4356  (raw dollar amount, log-transformed internally)

hours_per_week      int         1-99

native_country      str         'United-States' or any other country name
                                (internally mapped: US → 1, everything else → 0)

MODEL OPTIONS:
  'default'   → Higher precision (fewer false positives, may miss some >50K)
  'balanced'  → Higher recall (catches more >50K, but more false positives)
================================================================================
""")

# Three illustrative profiles, each using the raw_input schema that
# predict_income expects.

# Example 1: High earner profile
person_1 = dict(
    age=45,
    workclass='Private',
    education_num=13,  # Bachelors
    marital_status='Married-civ-spouse',
    occupation='Exec-managerial',
    sex='Male',
    capital_gain=15000,
    capital_loss=0,
    hours_per_week=50,
    native_country='United-States',
)

# Example 2: Lower earner profile
person_2 = dict(
    age=23,
    workclass='Private',
    education_num=9,  # HS-grad
    marital_status='Never-married',
    occupation='Other-service',
    sex='Female',
    capital_gain=0,
    capital_loss=0,
    hours_per_week=30,
    native_country='United-States',
)

# Example 3: Borderline case
person_3 = dict(
    age=38,
    workclass='Self-emp-not-inc',
    education_num=11,  # Assoc-voc
    marital_status='Married-civ-spouse',
    occupation='Craft-repair',
    sex='Male',
    capital_gain=0,
    capital_loss=0,
    hours_per_week=45,
    native_country='Mexico',
)

# Score every profile with both models and print a short side-by-side report.
examples = {
    'High earner profile': person_1,
    'Lower earner profile': person_2,
    'Borderline case': person_3,
}

for name, person in examples.items():
    pred_def, proba_def = predict_income(person, model_type='default')
    pred_bal, proba_bal = predict_income(person, model_type='balanced')

    # The trailing empty entry produces the blank separator line between profiles.
    report_lines = [
        f"=== {name} ===",
        f"  Age: {person['age']}, Education: {person['education_num']}, "
        f"Occupation: {person['occupation']}, Married: {person['marital_status']}",
        f"  Default model:  {pred_def} (probability: {proba_def:.3f})",
        f"  Balanced model: {pred_bal} (probability: {proba_bal:.3f})",
        "",
    ]
    print("\n".join(report_lines))


                        INCOME PREDICTION - INPUT GUIDE

FIELD               TYPE        VALUES / NOTES
─────────────────────────────────────────────────────────────────────────────────
age                 int         17-90

education_num       int         1  = Preschool
                                2  = 1st-4th grade
                                3  = 5th-6th grade
                                4  = 7th-8th grade
                                5  = 9th grade
                                6  = 10th grade
                                7  = 11th grade
                                8  = 12th grade
                                9  = High school graduate
                                10 = Some college
                                11 = Associate (vocational)
                                12 = Associate (academic)
                                13 = Bachelors
                                14 = Masters
                                15 = Professional school (e.g. la