In [1]:
import os

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Read data/census_cleaned.csv (path built portably with os.path.join) into a DataFrame.
census_csv_path = os.path.join("data", "census_cleaned.csv")
data = pd.read_csv(census_csv_path)

In [3]:
# Remove the "Unnamed: 0" column (presumably a row index saved by an earlier
# to_csv call -- confirm against the file). Reassigning instead of mutating in
# place, together with errors="ignore", keeps this cell safe to re-run: the
# in-place version raised KeyError on the second execution.
data = data.drop(columns=["Unnamed: 0"], errors="ignore")
data.head()

Unnamed: 0,age,workclass,fnlgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [19]:
from sklearn.preprocessing import LabelBinarizer, OneHotEncoder


def _make_dense_one_hot_encoder():
    """Build a dense-output OneHotEncoder on both old and new scikit-learn.

    scikit-learn 1.2 renamed the ``sparse`` keyword to ``sparse_output`` and
    1.4 removed the old spelling, so the new name is tried first and the old
    one is used only when the installed release rejects it.
    """
    try:
        return OneHotEncoder(sparse_output=False, handle_unknown="ignore")
    except TypeError:
        return OneHotEncoder(sparse=False, handle_unknown="ignore")


def process_data(
    X, categorical_features=None, label=None, training=True, encoder=None, lb=None
):
    """Split a DataFrame into an encoded feature matrix and a binarized label.

    The columns named in `categorical_features` are one-hot encoded and
    appended after the remaining columns, which are passed through unchanged.

    Parameters
    ----------
    X : pd.DataFrame
        Frame holding the features and, when `label` is given, the label column.
    categorical_features : list[str], optional
        Names of the columns to one-hot encode. ``None`` means no columns.
    label : str, optional
        Name of the label column in `X`. When ``None``, an empty array is
        returned for ``y``.
    training : bool
        When truthy, a new encoder and label binarizer are fitted on `X`.
        Otherwise the `encoder` and `lb` arguments are used to transform.
    encoder : fitted OneHotEncoder, optional
        Required when `training` is falsy.
    lb : fitted LabelBinarizer, optional
        Used when `training` is falsy and `label` is given. If it is ``None``
        the label column is returned untransformed.

    Returns
    -------
    X : np.ndarray
        Pass-through columns followed by the one-hot encoded columns.
    y : np.ndarray or pd.Series
        Binarized labels, an empty array when `label` is ``None``, or the raw
        label column when no `lb` was supplied for inference.
    encoder
        The encoder that was fitted (training) or passed in (inference).
    lb
        The binarizer that was fitted (training) or passed in (inference).

    Raises
    ------
    ValueError
        If `training` is truthy but no `label` is given.
    """
    if categorical_features is None:
        categorical_features = []

    has_label = label is not None
    if has_label:
        y = X[label]
        X = X.drop(columns=[label])
    else:
        y = np.array([])

    X_categorical = X[categorical_features].values
    X_continuous = X.drop(columns=categorical_features)

    if training:
        # Checked before any estimator is built so the failure is immediate
        # and explicit rather than an AttributeError on `ndarray.values`.
        if not has_label:
            raise ValueError("`label` must be provided when training=True")
        encoder = _make_dense_one_hot_encoder()
        lb = LabelBinarizer()
        X_categorical = encoder.fit_transform(X_categorical)
        y = lb.fit_transform(y.values).ravel()
    else:
        X_categorical = encoder.transform(X_categorical)
        # Only binarize when there is something to binarize and something to
        # do it with; genuine errors raised by `lb.transform` now propagate
        # instead of being swallowed.
        if has_label and lb is not None:
            y = lb.transform(y.values).ravel()

    X = np.concatenate([X_continuous, X_categorical], axis=1)
    return X, y, encoder, lb

In [4]:
# Hold out 20% of the rows for evaluation. The seed is fixed so the split, and
# every metric derived from it, is identical across Restart & Run All.
train, test = train_test_split(data, test_size=0.20, random_state=0)

# Columns that process_data one-hot encodes; all other feature columns are
# passed through unchanged.
cat_features = [
    "workclass",
    "education",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "native-country",
]

In [103]:
# Fit the encoder and label binarizer on the training split only ...
X_train, y_train, encoder, lb = process_data(
    train, categorical_features=cat_features, label="salary", training=True
)
# ... then reuse those fitted transformers on the held-out split.
X_test, y_test, _, _ = process_data(
    test,
    categorical_features=cat_features,
    label="salary",
    training=False,
    encoder=encoder,
    lb=lb,
)

In [104]:
# NOTE(review): this repeats the test-split preprocessing performed in the
# preceding cell with identical arguments; it looks like a candidate for removal.
X_test, y_test, _, _ = process_data(
    test,
    categorical_features=cat_features,
    label="salary",
    training=False,
    encoder=encoder,
    lb=lb,
)

In [105]:
from sklearn.metrics import fbeta_score, precision_score, recall_score
from sklearn.linear_model import LogisticRegression

In [106]:

# Optional: implement hyperparameter tuning.
def train_model(X_train, y_train):
    """Fit a logistic-regression classifier and return it.

    Parameters
    ----------
    X_train : array-like
        Training features, passed straight to ``fit``.
    y_train : array-like
        Training labels, passed straight to ``fit``.

    Returns
    -------
    The ``LogisticRegression`` instance (created with ``random_state=0``)
    as returned by its ``fit`` call.
    """
    classifier = LogisticRegression(random_state=0)
    return classifier.fit(X_train, y_train)


def compute_model_metrics(y, preds):
    """Score predictions against known labels.

    Parameters
    ----------
    y : array-like
        Known labels.
    preds : array-like
        Predicted labels.

    Returns
    -------
    tuple
        ``(precision, recall, fbeta)`` where ``fbeta`` is computed with
        ``beta=1``.
    """
    # Every scorer receives zero_division=1, the value scikit-learn returns
    # when the metric's denominator is zero.
    shared_options = {"zero_division": 1}
    precision = precision_score(y, preds, **shared_options)
    recall = recall_score(y, preds, **shared_options)
    fbeta = fbeta_score(y, preds, beta=1, **shared_options)
    return precision, recall, fbeta


def inference(model, X):
    """Return ``model.predict(X)``.

    Parameters
    ----------
    model : object exposing ``predict``
        The fitted estimator.
    X : array-like
        Features to predict on.
    """
    return model.predict(X)


In [107]:
# Fit on the training split, then predict on the held-out split.
model = train_model(X_train, y_train)
y_pred = inference(model, X_test)

# Score the held-out predictions against the held-out labels.
precision, recall, fbeta = compute_model_metrics(y_test, y_pred)

In [108]:
# Held-out metrics, printed space-separated in the order precision, recall, fbeta.
overall_metrics = (precision, recall, fbeta)
print(*overall_metrics)

0.717391304347826 0.24921334172435494 0.36992059785147124


In [109]:
# Sanity check: display the concrete class of the estimator returned by train_model.
type(model)

sklearn.linear_model._logistic.LogisticRegression

In [111]:
def inference_sliced_data(
    data: pd.DataFrame, category: str, value: str, label: str = "salary"
):
    """Compute model metrics on the rows of `data` where `category` equals `value`.

    Uses the notebook-level ``cat_features``, ``encoder``, ``lb`` and ``model``
    objects, so those cells must have been run first.

    Parameters
    ----------
    data : pd.DataFrame
        Frame containing the feature columns and the `label` column.
    category : str
        Name of the column to slice on.
    value : str
        Value of `category` that selects the slice.
    label : str
        Name of the label column (default ``"salary"``).

    Returns
    -------
    tuple
        The result of ``compute_model_metrics`` for the selected rows.

    Raises
    ------
    ValueError
        If no row of `data` has `category` equal to `value`.
    """
    data_slice = data.loc[data[category] == value]
    # Fail early with a readable message instead of passing an empty frame
    # into the preprocessing and scoring steps.
    if data_slice.empty:
        raise ValueError(f"no rows where {category!r} == {value!r}")

    X_slice, y_slice, _, _ = process_data(
        data_slice,
        cat_features,
        label=label,
        training=False,
        encoder=encoder,
        lb=lb,
    )
    slice_preds = inference(model, X_slice)
    return compute_model_metrics(y=y_slice, preds=slice_preds)



In [112]:
# Metrics restricted to the held-out rows whose "sex" column equals "Male".
inference_sliced_data(data=test, category="sex", value="Male")

(0.7511520737327189, 0.2391782831988261, 0.36282693377851977)