In [1]:
!pip install -q joblib scikit-learn pandas numpy tqdm


In [2]:
# Colab-only: mount Google Drive at /content/drive so the dataset and the
# output folder configured below (both under /content/drive/MyDrive) are
# reachable through the normal filesystem.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [3]:
import os, numpy as np, pandas as pd, joblib
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score


In [4]:
# --- Run configuration -------------------------------------------------------
# CSV to load (on the mounted Drive).
DATA_PATH = "/content/drive/MyDrive/datasets/adult.csv"

# Label column and the value that is encoded as the positive class (1).
TARGET = "income"
POS_LABEL = ">50K"

# Sensitive attribute. The loaded frame names this column "gender" (there is
# no "sex" column), so the constant must use that name to be usable as a key.
SENSITIVE = "gender"
# Value of the sensitive column; presumably the privileged group for a later
# fairness analysis — it is not referenced in the cells shown here.
PRIVILEGED = "Male"

# Folder that receives fitted pipelines, per-model predictions and metrics.
OUTPUT_DIR = "/content/drive/MyDrive/model_outputs/adult"
os.makedirs(OUTPUT_DIR, exist_ok=True)  # idempotent: safe on re-run


In [5]:
# Load the raw dataset from DATA_PATH with pandas' default parsing.
# NOTE(review): the preview below shows "?" placeholders (e.g. workclass,
# occupation). With default parsing they stay ordinary strings, so the
# imputers downstream will not treat them as missing — confirm this is intended.
df = pd.read_csv(DATA_PATH)


In [6]:
# Quick look at the first rows to check column names and value formats.
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K


In [7]:
# Binary target: 1 where the label equals POS_LABEL, 0 otherwise.
y = df[TARGET].eq(POS_LABEL).astype(int)

# Every remaining column is treated as a feature.
X = df.drop(columns=[TARGET])

# 80/20 hold-out split, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)


In [12]:
# NOTE(review): this cell no longer reads `sklearn` itself; the import is kept
# in case other cells rely on the name — consider moving it to the imports cell.
import sklearn

# Numeric features are every column with a numeric dtype (any int/float width,
# not just int64/float64); everything else is treated as categorical.
num_cols = X.select_dtypes(include='number').columns.tolist()
cat_cols = [c for c in X.columns if c not in num_cols]

# Build a dense one-hot encoder across scikit-learn versions by feature
# detection: newer releases take `sparse_output`, older ones only know
# `sparse`. Comparing `sklearn.__version__` as a string is unreliable because
# it sorts lexicographically ("1.10" < "1.4").
try:
    cat_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    cat_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Numeric branch: median imputation, then standardisation.
num_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])
# Categorical branch: fill gaps with the mode, then one-hot encode; categories
# unseen during fit are ignored at transform time (handle_unknown='ignore').
cat_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', cat_encoder)
])
# Route each column group through its branch; columns in neither list are dropped.
preproc = ColumnTransformer([
    ('num', num_pipe, num_cols),
    ('cat', cat_pipe, cat_cols)
], remainder='drop')

In [13]:
# Candidate classifiers, keyed by the short name used for output file names.
# Insertion order (lr, rf, mlp) is the training order.
models = dict(
    lr=LogisticRegression(solver='liblinear', max_iter=500),
    rf=RandomForestClassifier(random_state=42, n_estimators=100),
    mlp=MLPClassifier(
        random_state=42,
        max_iter=300,
        hidden_layer_sizes=(64, 32),
    ),
)


In [14]:
def logits_from_proba(p, eps=1e-9):
    """Convert probabilities to log-odds, ``log(p / (1 - p))``.

    Parameters
    ----------
    p : scalar or array-like
        Probabilities. Values are clipped to ``[eps, 1 - eps]`` first so that
        an exact 0 or 1 maps to a large finite logit instead of ``-inf``/``+inf``.
    eps : float, default 1e-9
        Clipping margin applied at both ends of the unit interval.

    Returns
    -------
    Log-odds in float64, with the same shape as ``p``.
    """
    # Work in float64: in float32, ``1 - 1e-9`` rounds to exactly 1.0, so the
    # upper clip would be a no-op and p == 1 would still produce +inf.
    if hasattr(p, "astype"):
        p = p.astype(np.float64)  # keeps the ndarray / Series container type
    else:
        p = np.asarray(p, dtype=np.float64)
    p = np.clip(p, eps, 1 - eps)
    return np.log(p / (1 - p))


# Accumulator for per-model score rows of the form [name, accuracy, f1, auc].
metrics = []


In [17]:
# Start from an empty list so that re-running this cell replaces the scores
# instead of appending a second copy of every model's row.
metrics = []

# Column holding the sensitive attribute: prefer the configured SENSITIVE name,
# falling back to "gender" when that name is not a column of X_test.
sensitive_col = SENSITIVE if SENSITIVE in X_test.columns else "gender"

for name, model in models.items():
    print(f"\nTraining model: {name.upper()}")

    # Preprocessing and classifier are fitted together on the training split.
    pipe = Pipeline([('preproc', preproc), ('clf', model)])
    pipe.fit(X_train, y_train)

    # Hard predictions, positive-class probabilities, and their log-odds.
    y_pred = pipe.predict(X_test)
    proba = pipe.predict_proba(X_test)[:, 1]
    logits = logits_from_proba(proba)

    # Persist the fitted pipeline (preprocessing + model) for later reuse.
    joblib.dump(pipe, f"{OUTPUT_DIR}/{name}.joblib")

    # Per-row test-set outputs alongside the sensitive attribute. The output
    # column keeps the name "gender" so the CSV schema stays the same.
    results_df = pd.DataFrame({
        "y_true": y_test.values,
        "y_pred": y_pred,
        "proba": proba,
        "logit": logits,
        "gender": X_test[sensitive_col].values
    })
    results_df.to_csv(f"{OUTPUT_DIR}/{name}_preds.csv", index=False)

    # Hold-out metrics; AUC is computed from probabilities, not hard labels.
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    auc = roc_auc_score(y_test, proba)
    metrics.append([name, acc, f1, auc])
    print(f"✅ {name.upper()} done → Acc: {acc:.3f}, F1: {f1:.3f}, AUC: {auc:.3f}")


Training model: LR
✅ LR done → Acc: 0.854, F1: 0.663, AUC: 0.906

Training model: RF
✅ RF done → Acc: 0.858, F1: 0.678, AUC: 0.904

Training model: MLP




✅ MLP done → Acc: 0.835, F1: 0.647, AUC: 0.876


In [18]:
# Tabulate the collected per-model scores and write them next to the models.
metric_columns = ["model", "accuracy", "f1", "auc"]
metrics_df = pd.DataFrame(data=metrics, columns=metric_columns)
metrics_df.to_csv(f"{OUTPUT_DIR}/metrics_summary.csv", index=False)
print(f"\nAll models done ✅  Metrics saved in: {OUTPUT_DIR}/metrics_summary.csv")



All models done ✅  Metrics saved in: /content/drive/MyDrive/model_outputs/adult/metrics_summary.csv
