In [2]:
import openml
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import confusion_matrix, roc_curve, auc, balanced_accuracy_score, roc_auc_score, classification_report
from scipy.stats import loguniform, randint
from skorch import NeuralNetClassifier
from skorch.callbacks import EarlyStopping

In [3]:
dataset = openml.datasets.get_dataset(dataset_id=1590, download_data=True, download_qualities=True, download_features_meta_data=True)
X, y, categorical_mask, colname=dataset.get_data(target=dataset.default_target_attribute , dataset_format="dataframe")

In [4]:
colname

['age',
 'workclass',
 'fnlwgt',
 'education',
 'education-num',
 'marital-status',
 'occupation',
 'relationship',
 'race',
 'sex',
 'capital-gain',
 'capital-loss',
 'hours-per-week',
 'native-country']

In [5]:
n_classes = y.nunique()
y

0        <=50K
1        <=50K
2         >50K
3         >50K
4        <=50K
         ...  
48837    <=50K
48838     >50K
48839    <=50K
48840    <=50K
48841     >50K
Name: class, Length: 48842, dtype: category
Categories (2, object): ['>50K' < '<=50K']

In [6]:
label_ecn= LabelEncoder()
y = label_ecn.fit_transform(y=y)
y

array([0, 0, 1, ..., 0, 0, 1], shape=(48842,))

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 42)

In [8]:
categorical_cols = X.columns[categorical_mask].tolist()
numeric_cols =[]
for i, col in enumerate(colname):
    if not categorical_mask[i]:
        numeric_cols.append(col)


In [9]:
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
    ]
)

In [15]:
X_train_trans= preprocessor.fit_transform(X=X_train)
X_train_trans.shape[1]

108

In [None]:
lr_pipe = Pipeline(
    steps=[
        ("preprocess", preprocessor),
        ("clf", LogisticRegression(
            class_weight="balanced",
            max_iter=5000,
            tol=1e-4,
            random_state=42,
            solver="saga",
        ))
    ]
)

lr_param_dist = {
    "clf__C": loguniform(1e-4, 1e2),
    "clf__l1_ratio": np.linspace(0, 1, 5)
}

In [None]:
lr_search = RandomizedSearchCV(
    estimator=lr_pipe,
    param_distributions=lr_param_dist,
    n_iter=30,
    scoring="balanced_accuracy",
    cv=5,
    random_state=42,
    n_jobs=8,
    verbose=1,
    return_train_score=True,
)

lr_search.fit(X_train, y_train)

best_lr = lr_search.best_estimator_

print("Best Logistic Regression params:")
print(lr_search.best_params_)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
Best Logistic Regression params:
{'clf__C': np.float64(0.04374364439939081), 'clf__l1_ratio': np.float64(0.75)}


In [None]:
rf_pipe = Pipeline(
    steps=[
        ("preprocess", preprocessor),
        ("clf", RandomForestClassifier(
            class_weight="balanced",
            random_state=42,
            n_jobs=8,
        ))
    ]
)

rf_param_dist = {
    "clf__n_estimators": randint(300, 600),
    "clf__max_depth": [None, 10 ,20, 30],
    "clf__min_samples_split": randint(2, 20),
    "clf__min_samples_leaf": randint(1, 10),
    "clf__max_features": ["sqrt", "log2", None]
}

In [None]:
rf_search = RandomizedSearchCV(
    estimator=rf_pipe,
    param_distributions=rf_param_dist,
    n_iter=20,
    scoring="balanced_accuracy",
    cv=3,
    random_state=42,
    n_jobs=1,
    verbose=1,
    return_train_score=True,
)

rf_search.fit(X_train, y_train)

best_rf = rf_search.best_estimator_

print("Best Random Forest params:")
print(rf_search.best_params_)

Fitting 3 folds for each of 20 candidates, totalling 60 fits
Best Random Forest params:
{'clf__max_depth': None, 'clf__max_features': 'sqrt', 'clf__min_samples_leaf': 7, 'clf__min_samples_split': 19, 'clf__n_estimators': 388}


In [10]:
class MLP_IG(nn.Module):
    def __init__(self, output_dim, hidden_dim=16):
        super().__init__()

        self.fc1 = nn.LazyLinear(hidden_dim, 16)
        self.fc2 = nn.Linear(16, 16)
        self.fc3 = nn.Linear(16, 16)
        self.out = nn.Linear(16, output_dim)

        # # better initialization for IG smoothness
        for m in self.modules():
            if isinstance(m, nn.Linear) and not isinstance(m, nn.LazyLinear):
                nn.init.xavier_uniform_(m.weight)
                nn.init.zeros_(m.bias)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        logits = self.out(x)  # no softmax
        return logits

In [11]:
net = NeuralNetClassifier(
    module=MLP_IG,
    module__output_dim=n_classes,
    criterion=nn.CrossEntropyLoss,
    optimizer=torch.optim.Adam,
    max_epochs=100,
    batch_size=64,
    iterator_train__shuffle=True,
    device="cuda" if torch.cuda.is_available() else "cpu",
    verbose=0,
    callbacks=[EarlyStopping(patience=10)],
)

# to resolve conflict error between the the preprocesser type output float64 and the 
# skorch converts NumPy to Torch without dtype casting, and the MLP weights are are float32
# creating a conflit
to_float32 = FunctionTransformer(
    lambda X: X.astype(np.float32),
    accept_sparse=True
)

mlp_pipe = Pipeline(
    steps=[
        ("preprocess", preprocessor),
        ("to_float32", to_float32),
        ("clf", net)
    ]
)


mlp_param_dist = {
    "clf__lr": loguniform(1e-4, 1e-2),
    "clf__max_epochs": [50, 100, 150],
    "clf__batch_size": [32, 64, 128],
    "clf__optimizer__weight_decay": loguniform(1e-6, 1e-3),
    "clf__optimizer__betas": [(0.9, 0.999), (0.9, 0.99)],
}

In [12]:
mlp_search = RandomizedSearchCV(
    estimator=mlp_pipe,
    param_distributions=mlp_param_dist,
    n_iter=20,
    scoring="balanced_accuracy",
    cv=3,
    random_state=42,
    verbose=1,
    n_jobs=1,
)

mlp_search.fit(X_train, y_train)

best_mlp = mlp_search.best_estimator_

print("Best MLP params:")
print(mlp_search.best_params_)

Fitting 3 folds for each of 20 candidates, totalling 60 fits
Best MLP params:
{'clf__batch_size': 64, 'clf__lr': np.float64(0.0013658426050382536), 'clf__max_epochs': 50, 'clf__optimizer__betas': (0.9, 0.999), 'clf__optimizer__weight_decay': np.float64(0.00033639871159587913)}


In [18]:
pred = best_lr.predict(X_test)
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.95      0.81      0.87      9354
           1       0.57      0.85      0.69      2857

    accuracy                           0.82     12211
   macro avg       0.76      0.83      0.78     12211
weighted avg       0.86      0.82      0.83     12211



In [11]:
pred = best_rf.predict(X_test)
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.95      0.81      0.87      9354
           1       0.58      0.87      0.70      2857

    accuracy                           0.82     12211
   macro avg       0.77      0.84      0.79     12211
weighted avg       0.87      0.82      0.83     12211



In [14]:
pred = best_mlp.predict(X_test)
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.89      0.93      0.91      9354
           1       0.74      0.63      0.68      2857

    accuracy                           0.86     12211
   macro avg       0.81      0.78      0.79     12211
weighted avg       0.85      0.86      0.86     12211

