# Description

# Config 

In [1]:
PATH_TO_DATA = "../data/external/adult.csv"
TARGET_COL = "income"
CATEGORICAL_COLUMNS = [
    'workclass', 'education', 'marital-status', 'occupation',
       'relationship', 'race', 'gender', 'native-country'
]
NUMERICAL_COLUMNS = ['age', 'fnlwgt', 'educational-num', 'capital-gain', 'capital-loss', 'hours-per-week']

RANDOM_STATE = 42


# Libs

In [2]:
from pprint import pprint 
import pandas as pd 
import numpy as np
from matplotlib import pyplot as plt 
import seaborn as sns

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler 
from lightgbm import LGBMClassifier
from category_encoders import OneHotEncoder, CatBoostEncoder
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier

from sklearn.metrics import roc_auc_score

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.
Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


# Analysis 

In [3]:
# read data 
data = pd.read_csv(PATH_TO_DATA)

In [4]:
# process target 
data[TARGET_COL] = data[TARGET_COL] == ">50K"

In [5]:
# split into train/test 
X = data.drop(TARGET_COL, axis=1)
y = data[TARGET_COL]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, 
    random_state=RANDOM_STATE, stratify=y
)

In [6]:
experiment = {
    "train_data_raw": X_train, 
    "train_target": y_train,
    "test_data_raw": X_test,
    "test_target": y_test,
    
    "dataset_info": {
        "categorical_columns": CATEGORICAL_COLUMNS,
        "numerical_columns": NUMERICAL_COLUMNS
    },
    
    "models": {}
}

# Train models 

### Model 1 - LightGBM with Catboost encoder 

In [7]:
X_train_processed = X_train.copy("deep")
X_test_processed = X_test.copy("deep")

In [8]:
encoder = CatBoostEncoder(cols=CATEGORICAL_COLUMNS, random_state=RANDOM_STATE)
X_train_processed = encoder.fit_transform(X_train_processed, y_train)
X_test_processed = encoder.transform(X_test_processed)

  elif pd.api.types.is_categorical(cols):


In [9]:
model = LGBMClassifier(n_estimators=100, learning_rate=0.03, subsample=0.8, random_state=RANDOM_STATE)
model.fit(X_train_processed, y_train)

LGBMClassifier(learning_rate=0.03, random_state=42, subsample=0.8)

In [10]:
y_hat = model.predict_proba(X_test_processed)[:, 1]

In [11]:
score = roc_auc_score(y_test, y_hat)
score

0.9240818020239729

In [12]:
experiment["models"]["model_1"] = {
    "experiment": "LGB", 
    "X_train_processed": X_train_processed,
    "X_test_processed": X_test_processed,
    "model": model,
    "score": score 
}

### Model 2 - LightGBM with Catboost encoder (buggy)

In [13]:
X_train_processed = X_train.copy("deep")
X_test_processed = X_test.copy("deep")

In [14]:
X_combined = pd.concat([X_train_processed, X_test_processed], axis=0).reset_index(drop=True)
y_combined = pd.concat([y_train, y_test], axis=0).reset_index(drop=True)

In [15]:
encoder = CatBoostEncoder(cols=CATEGORICAL_COLUMNS, random_state=RANDOM_STATE)
X_combined_transformed = encoder.fit_transform(X_combined, y_combined)

  elif pd.api.types.is_categorical(cols):


In [16]:
X_train_processed = X_combined_transformed[:X_train_processed.shape[0]].reset_index(drop=True)
X_test_processed = X_combined_transformed[X_train_processed.shape[0]:].reset_index(drop=True)

In [17]:
model = LGBMClassifier(n_estimators=100, learning_rate=0.03, subsample=0.8, random_state=RANDOM_STATE)
model.fit(X_train_processed, y_train)

LGBMClassifier(learning_rate=0.03, random_state=42, subsample=0.8)

In [18]:
y_hat = model.predict_proba(X_test_processed)[:, 1]

In [19]:
score = roc_auc_score(y_test, y_hat)
score

0.9237535524601345

In [20]:
experiment["models"]["model_2"] = {
    "experiment": "buggy LGB", 
    "X_train_processed": X_train_processed,
    "X_test_processed": X_test_processed,
    "model": model,
    "score": score 
}

### Model 3 - KNN 

In [21]:
X_train_processed = X_train.copy("deep")
X_test_processed = X_test.copy("deep")

In [22]:
scaler = StandardScaler()
X_train_processed[NUMERICAL_COLUMNS] = scaler.fit_transform(X_train_processed[NUMERICAL_COLUMNS])
X_test_processed[NUMERICAL_COLUMNS] = scaler.transform(X_test_processed[NUMERICAL_COLUMNS])

In [23]:
encoder = OneHotEncoder(cols=CATEGORICAL_COLUMNS)
X_train_processed = encoder.fit_transform(X_train_processed, y_train)
X_test_processed = encoder.transform(X_test_processed)

  elif pd.api.types.is_categorical(cols):


In [24]:
model = KNeighborsClassifier(n_neighbors=11)
model.fit(X_train_processed, y_train)

KNeighborsClassifier(n_neighbors=11)

In [25]:
y_hat = model.predict_proba(X_test_processed)[:, 1]

In [26]:
score = roc_auc_score(y_test, y_hat)
score

0.8812110900199236

In [27]:
experiment["models"]["model_3"] = {
    "experiment": "KNN", 
    "X_train_processed": X_train_processed,
    "X_test_processed": X_test_processed,
    "model": model,
    "score": score 
}

### Model 4 - KNN (buggy, no scaling)

In [28]:
X_train_processed = X_train.copy("deep")
X_test_processed = X_test.copy("deep")

In [29]:
encoder = OneHotEncoder(cols=CATEGORICAL_COLUMNS)
X_train_processed = encoder.fit_transform(X_train_processed, y_train)
X_test_processed = encoder.transform(X_test_processed)

  elif pd.api.types.is_categorical(cols):


In [30]:
model = KNeighborsClassifier(n_neighbors=11)
model.fit(X_train_processed, y_train)

KNeighborsClassifier(n_neighbors=11)

In [31]:
y_hat = model.predict_proba(X_test_processed)[:, 1]

In [32]:
score = roc_auc_score(y_test, y_hat)
score

0.6662278154764024

In [33]:
experiment["models"]["model_4"] = {
    "experiment": "buggy KNN", 
    "model": model,
    "score": score 
}

### Model 5 - DNN (Keras)

In [34]:
X_train_processed = X_train.copy("deep")
X_test_processed = X_test.copy("deep")

In [35]:
scaler = StandardScaler()
X_train_processed[NUMERICAL_COLUMNS] = scaler.fit_transform(X_train_processed[NUMERICAL_COLUMNS])
X_test_processed[NUMERICAL_COLUMNS] = scaler.transform(X_test_processed[NUMERICAL_COLUMNS])

In [36]:
encoder = OneHotEncoder(cols=CATEGORICAL_COLUMNS)
X_train_processed = encoder.fit_transform(X_train_processed, y_train)
X_test_processed = encoder.transform(X_test_processed)

  elif pd.api.types.is_categorical(cols):


In [37]:
def get_simple_dnn_model():
    model = Sequential()
    model.add(Dense(128, input_dim=X_train_processed.shape[1], activation='relu'))
    model.add(Dense(128, input_dim=128, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [38]:
model = KerasClassifier(build_fn=get_simple_dnn_model, epochs=5, batch_size=256, verbose=1)
model.fit(X_train_processed, y_train)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x152486780>

In [39]:
y_hat = model.predict_proba(X_test_processed)[:, 1]



In [40]:
score = roc_auc_score(y_test, y_hat)
score

0.9129275801034675

In [41]:
experiment["models"]["model_5"] = {
    "c": "keras DNN", 
    "X_train_processed": X_train_processed,
    "X_test_processed": X_test_processed,
    "model": model,
    "score": score 
}

# Results 

In [42]:
for model_name, model_result in experiment["models"].items():
    print(f"Model: {model_name}, score: {model_result['score']:.4f}")

Model: model_1, score: 0.9241
Model: model_2, score: 0.9238
Model: model_3, score: 0.8812
Model: model_4, score: 0.6662
Model: model_5, score: 0.9129


In [43]:
pprint(experiment)

{'dataset_info': {'categorical_columns': ['workclass',
                                          'education',
                                          'marital-status',
                                          'occupation',
                                          'relationship',
                                          'race',
                                          'gender',
                                          'native-country'],
                  'numerical_columns': ['age',
                                        'fnlwgt',
                                        'educational-num',
                                        'capital-gain',
                                        'capital-loss',
                                        'hours-per-week']},
 'models': {'model_1': {'X_test_processed':        age  workclass  fnlwgt  education  educational-num  marital-status  \
44696   38   0.217685  159179   0.158728                9        0.448228   
31814   21   0.131722  2327