In [9]:
import pandas as pd
import numpy as np

## Load and Transform Data

In [17]:
# Columns that hold labels rather than quantities. 'education-num' is included
# here too, so it is treated as a category instead of a number.
categorical_columns = [
    'workclass', 'education', 'education-num', 'marital-status', 'occupation',
    'relationship', 'race', 'sex', 'native-country', 'income',
]

# Integer-typed columns that should be handled as continuous values.
float_columns = ['capital-gain', 'capital-loss']

income_df = pd.read_csv(r"C:\Users\broga\OneDrive\Desktop\MSBA\Adv_ML\Labs\Lab_3\income_evaluation.csv")

# Strip surrounding whitespace from the column names so the lookups below match.
income_df.columns = income_df.columns.str.strip()

# Apply both dtype conversions in a single pass with a per-column dtype map.
dtype_map = {
    **{name: 'category' for name in categorical_columns},
    **{name: 'float64' for name in float_columns},
}
income_df = income_df.astype(dtype_map)

# Show the first row as a quick sanity check of the loaded frame.
income_df.head(1)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40,United-States,<=50K


In [19]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import make_column_selector
from sklearn.preprocessing import LabelEncoder

# Feature matrix: every column except the target.
X = income_df.drop(columns='income')

# Encode the income labels as integers (numeric labels are needed for XGBoost).
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(income_df['income'])
y = y_encoded

# Hold out 20% of the rows. The neural network models are fit and scored on
# this single split because they are more computationally expensive.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)

# Column selectors by dtype: category/object columns vs. numeric columns.
categorical_selector = make_column_selector(dtype_include=['category', 'object'])
numeric_selector = make_column_selector(dtype_include=np.number)

# Shared preprocessing: one-hot encode the categorical columns, standardize the
# numeric columns, and pass through anything matched by neither selector.
ct = ColumnTransformer(
    transformers=[
        (
            "dummify",
            OneHotEncoder(sparse_output=False, handle_unknown='ignore'),
            categorical_selector,
        ),
        ("standardize", StandardScaler(), numeric_selector),
    ],
    remainder="passthrough",
)

I included the train/test split so that the neural networks can be evaluated on a single held-out set: running GridSearchCV on neural networks is computationally very expensive and takes a significant amount of time on my machine.

## Naive Bayes

I decided to use the Bernoulli Naive Bayes method because it works well with binary variables. The `class_prior` parameter is included to account for the imbalanced target class.

In [26]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import make_scorer, accuracy_score, f1_score, roc_auc_score

# Fixed class priors handed to BernoulliNB, intended to account for the
# imbalance in the target.
nb_class_prior = [0.76, 0.24]

# Preprocess with the shared column transformer, then fit Bernoulli Naive Bayes.
nb_pipeline = Pipeline([
    ("preprocessing", ct),
    ("naive_bayes", BernoulliNB(class_prior=nb_class_prior))
])

# Small grid over the additive smoothing parameter.
param_grid = {
    "naive_bayes__alpha": [0.5, 1.0],
}

# Score each candidate with metrics that are informative for imbalanced data.
# Built-in scorer names are used instead of
# make_scorer(roc_auc_score, needs_threshold=True): the `needs_threshold`
# keyword was deprecated in scikit-learn 1.4 and later removed, while the
# 'roc_auc' scorer selects the same metric on continuous model outputs.
grid_search = GridSearchCV(
    nb_pipeline,
    param_grid,
    cv=5,
    scoring={
        'Accuracy': 'accuracy',
        'F1': 'f1',
        'ROC_AUC': 'roc_auc',
    },
    refit='ROC_AUC',  # the best candidate is chosen by mean ROC AUC
    verbose=1,
)

# Cross-validate on the full dataset.
grid_search.fit(X, y)

print("Best parameters:", grid_search.best_params_)

# Report the mean cross-validated scores of the selected candidate.
best_accuracy = round(grid_search.cv_results_['mean_test_Accuracy'][grid_search.best_index_], 4)
best_f1 = round(grid_search.cv_results_['mean_test_F1'][grid_search.best_index_], 4)
best_roc_auc = round(grid_search.cv_results_['mean_test_ROC_AUC'][grid_search.best_index_], 4)

print(f"Best BernoulliNB Model Accuracy: {best_accuracy}")
print(f"Best BernoulliNB Model F1 Score: {best_f1}")
print(f"Best BernoulliNB Model ROC AUC Score: {best_roc_auc}")


Fitting 5 folds for each of 2 candidates, totalling 10 fits
Best parameters: {'naive_bayes__alpha': 0.5}
Best BernoulliNB Model Accuracy: 0.7914
Best BernoulliNB Model F1 Score: 0.6433
Best BernoulliNB Model ROC AUC Score: 0.8794


For a relatively simple setup and tuning process, the Bernoulli Naive Bayes method performs well and seems to be a good choice for a quick and simple model.

## Neural Networks

I initially compared a model using a logistic activation function and the Adam solver against an otherwise identical model using a ReLU activation function and the SGD solver. The latter model performed better, so I then experimented further with various other parameters.

In [30]:
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.preprocessing import StandardScaler  # Example of preprocessing

# Baseline network: five hidden layers of 100 units, logistic activations,
# trained with the Adam solver on top of the shared preprocessing step.
mlp_pipeline1 = Pipeline(steps=[
    ("preprocessing", ct),
    (
        "mlp_classifier",
        MLPClassifier(
            hidden_layer_sizes=(100,) * 5,
            activation='logistic',
            solver='adam',
            learning_rate='constant',
            alpha=0.0001,
            batch_size=64,
            max_iter=1000,
            n_iter_no_change=10,
            random_state=1,
        ),
    ),
])

# Train on the training split only.
mlp_pipeline1.fit(X_train, y_train)

# Hard labels feed accuracy and F1; ROC AUC is computed from the
# positive-class probability (second column of predict_proba).
y_pred = mlp_pipeline1.predict(X_test)
y_pred_proba = mlp_pipeline1.predict_proba(X_test)[:, 1]

# Held-out metrics.
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)

# Report the results.
print(f"MLP Classifier Accuracy: {accuracy}")
print(f"MLP Classifier F1 Score: {f1}")
print(f"MLP Classifier ROC AUC Score: {roc_auc}")


MLP Classifier Accuracy: 0.8105327805926609
MLP Classifier F1 Score: 0.6131661442006269
MLP Classifier ROC AUC Score: 0.8518633837066756


In [31]:
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.preprocessing import StandardScaler  # Example of preprocessing

# Same five-layer architecture as the first network, but with ReLU activations
# and the SGD solver.
mlp_pipeline2 = Pipeline(steps=[
    ("preprocessing", ct),
    (
        "mlp_classifier",
        MLPClassifier(
            hidden_layer_sizes=(100,) * 5,
            activation='relu',
            solver='sgd',
            learning_rate='constant',
            alpha=0.0001,
            batch_size=64,
            max_iter=1000,
            n_iter_no_change=10,
            random_state=1,
        ),
    ),
])

# Train on the training split only.
mlp_pipeline2.fit(X_train, y_train)

# Hard labels feed accuracy and F1; ROC AUC is computed from the
# positive-class probability (second column of predict_proba).
y_pred = mlp_pipeline2.predict(X_test)
y_pred_proba = mlp_pipeline2.predict_proba(X_test)[:, 1]

# Held-out metrics.
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)

# Report the results.
print(f"MLP Classifier Accuracy: {accuracy}")
print(f"MLP Classifier F1 Score: {f1}")
print(f"MLP Classifier ROC AUC Score: {roc_auc}")

MLP Classifier Accuracy: 0.8292645478274221
MLP Classifier F1 Score: 0.616551724137931
MLP Classifier ROC AUC Score: 0.8733773349664462


In [32]:
# ReLU + SGD network with stronger L2 regularization (alpha=0.01) and a larger
# mini-batch (264) than the previous configuration.
mlp_pipeline3 = Pipeline(steps=[
    ("preprocessing", ct),
    (
        "mlp_classifier",
        MLPClassifier(
            hidden_layer_sizes=(100,) * 5,
            activation='relu',
            solver='sgd',
            learning_rate='constant',
            alpha=0.01,
            batch_size=264,
            max_iter=1000,
            n_iter_no_change=10,
            random_state=1,
        ),
    ),
])

# Train on the training split only.
mlp_pipeline3.fit(X_train, y_train)

# Hard labels feed accuracy and F1; ROC AUC is computed from the
# positive-class probability (second column of predict_proba).
y_pred = mlp_pipeline3.predict(X_test)
y_pred_proba = mlp_pipeline3.predict_proba(X_test)[:, 1]

# Held-out metrics.
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)

# Report the results.
print(f"MLP Classifier Accuracy: {accuracy}")
print(f"MLP Classifier F1 Score: {f1}")
print(f"MLP Classifier ROC AUC Score: {roc_auc}")

MLP Classifier Accuracy: 0.8369415016121603
MLP Classifier F1 Score: 0.6325259515570935
MLP Classifier ROC AUC Score: 0.8821319454907113


This was the best-performing neural network model I created. XGBoost exceeded its F1 and ROC AUC scores and was essentially tied on accuracy (0.8361 cross-validated versus 0.8369 on the held-out split). I suspect that, without the limitations of my machine and of scikit-learn's neural network capabilities, I would be able to create a model that outperforms XGBoost.

In [33]:
# Deeper variant: six hidden layers of 100 units with alpha=0.001.
# It is stored under its own name (mlp_pipeline4) so that it does not overwrite
# the five-layer model held in mlp_pipeline3.
mlp_pipeline4 = Pipeline([
    ("preprocessing", ct),
    ("mlp_classifier", MLPClassifier(hidden_layer_sizes=(100, 100, 100, 100, 100, 100),
                                     activation='relu',
                                     solver='sgd',
                                     learning_rate='constant',
                                     alpha=0.001,
                                     max_iter=1000,
                                     n_iter_no_change=10,
                                     batch_size=264,
                                     random_state=1))
])

# Fit the model on the training data
mlp_pipeline4.fit(X_train, y_train)

# Predict the test data
y_pred_proba = mlp_pipeline4.predict_proba(X_test)[:, 1]  # probability of the positive class
y_pred = mlp_pipeline4.predict(X_test)

# Calculate the metrics on the held-out split
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)  # ROC AUC uses predicted probabilities

# Print the results
print(f"MLP Classifier Accuracy: {accuracy}")
print(f"MLP Classifier F1 Score: {f1}")
print(f"MLP Classifier ROC AUC Score: {roc_auc}")

MLP Classifier Accuracy: 0.8261937663135268
MLP Classifier F1 Score: 0.6261558784676354
MLP Classifier ROC AUC Score: 0.8762190476368882


In [None]:
# NOTE: This cell is intentionally disabled; it is kept only as a GridSearchCV template.
# As written it wraps MLPRegressor and scores with MAE / R^2, so it would need
# MLPClassifier and classification scorers before being run on the encoded income target.
# from sklearn.neural_network import MLPRegressor
# from sklearn.model_selection import GridSearchCV
# from sklearn.pipeline import Pipeline
# from sklearn.model_selection import cross_val_score

# # Preprocessing step, assuming 'ct' is some pre-defined preprocessing
# mlp_pipeline = Pipeline([
#     ("preprocessing", ct),
#     ("mlp_regression", MLPRegressor(random_state=1,batch_size=64))
# ])

# # Parameter grid for MLP
# param_grid = {
#     "mlp_regression__hidden_layer_sizes": [(100,)],
#     "mlp_regression__activation": ['relu'],
#     "mlp_regression__solver": ['adam'],
#     "mlp_regression__alpha": [0.0001],
#     "mlp_regression__learning_rate": ['constant'],
#     "mlp_regression__learning_rate_init": [0.5]
# }

# grid_search = GridSearchCV(mlp_pipeline, param_grid, cv=5, scoring=['neg_mean_absolute_error', 'r2'], refit='neg_mean_absolute_error', verbose=1)

# # Fit the model
# grid_search.fit(X, y)

# print("Best parameters:", grid_search.best_params_)

# # Convert the negative MAE to positive MAE and display the best R^2 score
# best_mlp_mae = round(-grid_search.cv_results_['mean_test_neg_mean_absolute_error'][grid_search.best_index_], 4)
# best_mlp_r2 = round(grid_search.cv_results_['mean_test_r2'][grid_search.best_index_], 4)

# print(f"Best MLP Model MAE: {best_mlp_mae}")
# print(f"Best MLP Model R^2: {best_mlp_r2}")

The code above would have been the template used to perform GridSearchCV for proper cross-validation; however, the computation was too intensive for my machine and took a significant amount of time.

## XGBoost Classifier

To account for the imbalanced target class, I included the scale_pos_weight parameter.

In [29]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import make_scorer, accuracy_score, f1_score, roc_auc_score
from xgboost import XGBClassifier

# Ratio of negative to positive examples, passed to XGBoost as scale_pos_weight
# to account for the imbalanced target. The vectorized .sum() counts the
# matching labels without iterating over the array element by element.
xgb_scale_pos_weight = (y == 0).sum() / (y == 1).sum()

# Pipeline setup: shared preprocessing followed by the XGBoost classifier.
xgb_pipeline = Pipeline([
    ("preprocessing", ct),
    ("xgboost", XGBClassifier(use_label_encoder=False, eval_metric='logloss',
                              scale_pos_weight=xgb_scale_pos_weight, random_state=1))
])

# Parameter grid for XGBoost: 3 x 3 x 3 = 27 candidates.
param_grid = {
    "xgboost__max_depth": [4, 6, 8],
    "xgboost__learning_rate": [0.05, 0.1, 0.2],
    "xgboost__n_estimators": [100, 150, 200]
}

# Score each candidate with metrics that are informative for imbalanced data.
# Built-in scorer names are used instead of
# make_scorer(roc_auc_score, needs_threshold=True): the `needs_threshold`
# keyword was deprecated in scikit-learn 1.4 and later removed, while the
# 'roc_auc' scorer selects the same metric on continuous model outputs.
grid_search = GridSearchCV(
    xgb_pipeline,
    param_grid,
    cv=5,
    scoring={
        'Accuracy': 'accuracy',
        'F1': 'f1',
        'ROC_AUC': 'roc_auc',
    },
    refit='ROC_AUC',  # the best candidate is chosen by mean ROC AUC
    verbose=1,
)

# Cross-validate on the full dataset.
grid_search.fit(X, y)

print("Best parameters:", grid_search.best_params_)

# Report the mean cross-validated scores of the selected candidate.
best_accuracy = round(grid_search.cv_results_['mean_test_Accuracy'][grid_search.best_index_], 4)
best_f1 = round(grid_search.cv_results_['mean_test_F1'][grid_search.best_index_], 4)
best_roc_auc = round(grid_search.cv_results_['mean_test_ROC_AUC'][grid_search.best_index_], 4)

print(f"Best XGBoost Model Accuracy: {best_accuracy}")
print(f"Best XGBoost Model F1 Score: {best_f1}")
print(f"Best XGBoost Model ROC AUC Score: {best_roc_auc}")


Fitting 5 folds for each of 27 candidates, totalling 135 fits
Best parameters: {'xgboost__learning_rate': 0.2, 'xgboost__max_depth': 4, 'xgboost__n_estimators': 200}
Best XGBoost Model Accuracy: 0.8361
Best XGBoost Model F1 Score: 0.7164
Best XGBoost Model ROC AUC Score: 0.9287


Due to what I believe are the local computational limitations on tuning the neural network, my XGBoost model outperformed the other models overall, which is common in Kaggle-style prediction tasks.