In [133]:
# Import the necessay libraries
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer, OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import (
    RandomForestClassifier,
    ExtraTreesClassifier,
    GradientBoostingClassifier
)
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import SelectFromModel 
from xgboost import XGBClassifier

from sklearn.metrics import (
    fbeta_score,
    f1_score,
    accuracy_score, 
    recall_score, 
    precision_score, 
    precision_recall_fscore_support,
    confusion_matrix, 
    classification_report
)
from sklearn.metrics import precision_recall_fscore_support as score

import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

In [3]:
# Load the data into a dataframe
df = pd.read_csv('data/clean_census.csv')
# Display the three first rows from the dataframe
df.head(3)

Unnamed: 0,age,workclass,fnlgt,education,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K


In [4]:
def process_data(
    X, categorical_features=[], label=None, training=True, encoder=None, lb=None
):
    """ Process the data used in the machine learning pipeline.

    Processes the data using one hot encoding for the categorical features and a
    label binarizer for the labels. This can be used in either training or
    inference/validation.

    Note: depending on the type of model used, you may want to add in functionality that
    scales the continuous data.

    Inputs
    ------
    X : pd.DataFrame
        Dataframe containing the features and label. Columns in `categorical_features`
    categorical_features: list[str]
        List containing the names of the categorical features (default=[])
    label : str
        Name of the label column in `X`. If None, then an empty array will be returned
        for y (default=None)
    training : bool
        Indicator if training mode or inference/validation mode.
    encoder : sklearn.preprocessing._encoders.OneHotEncoder
        Trained sklearn OneHotEncoder, only used if training=False.
    lb : sklearn.preprocessing._label.LabelBinarizer
        Trained sklearn LabelBinarizer, only used if training=False.

    Returns
    -------
    X : np.array
        Processed data.
    y : np.array
        Processed labels if labeled=True, otherwise empty np.array.
    encoder : sklearn.preprocessing._encoders.OneHotEncoder
        Trained OneHotEncoder if training is True, otherwise returns the encoder passed
        in.
    lb : sklearn.preprocessing._label.LabelBinarizer
        Trained LabelBinarizer if training is True, otherwise returns the binarizer
        passed in.
    """

    if label is not None:
        y = X[label]
        X = X.drop([label], axis=1)
    else:
        y = np.array([])

    X_categorical = X[categorical_features].values
    X_continuous = X.drop(*[categorical_features], axis=1)

    if training is True:
        encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
        lb = LabelBinarizer()
        X_categorical = encoder.fit_transform(X_categorical)
        y = lb.fit_transform(y.values).ravel()
    else:
        X_categorical = encoder.transform(X_categorical)
        try:
            y = lb.fit_transform(y.values).ravel()
        # Catch the case where y is None because we're doing inference.
        except AttributeError as error:
            print("Error occur: ", error)

    X = np.concatenate([X_continuous, X_categorical], axis=1)
    return X, y, encoder, lb

In [5]:
# Get the categorical feature except the column salary
categorical_features = list(df.select_dtypes(['object', 'category']).columns)[:-1]
# Show the columns
categorical_features

['workclass',
 'education',
 'marital-status',
 'occupation',
 'relationship',
 'race',
 'sex',
 'native-country']

In [6]:
# Split the dataset into train and test
train, test = train_test_split(df, shuffle=True, stratify=None, test_size=0.20, random_state=42)

In [7]:
# Create the OneHotEncoder and LabelBinarizer() objects
encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
binarizer = LabelBinarizer()

In [8]:
# Get the metrics from the trained model
def compute_model_metrics(y, preds):
    """
    Validates the trained machine learning model using precision, recall, and F1.

    Inputs
    ------
    y : np.array
        Known labels, binarized.
    preds : np.array
        Predicted labels, binarized.
    Returns
    -------
    precision : float
    recall : float
    fbeta : float
    """
    fbeta = fbeta_score(y, preds, beta=1, zero_division=1)
    precision = precision_score(y, preds, zero_division=1)
    recall = recall_score(y, preds, zero_division=1)
    return precision, recall, fbeta

In [9]:
# Optional: implement hyperparameter tuning.
def train_model(X_train, y_train, models):
    """
    Trains a machine learning model and returns it.

    Inputs
    ------
    X_train : np.array
        Training data.
    y_train : np.array
        Labels.
    Returns
    -------
    model
        Trained machine learning model.
    """

    for key in models.keys():

        models[key].fit(X_train, y_train)

    return models

In [10]:
# Get the processed train data
X_train, y_train, encoder, lb = process_data(
                        train, categorical_features=categorical_features, label="salary", training=True
                    )

In [11]:
# Create a dictionary for differents models
models = {}
models['Logistic Regression'] = LogisticRegression()
models['Extrat Classfier'] = ExtraTreesClassifier(n_estimators=50)
models['Support Vector Machines'] = LinearSVC()
models['Gradient Boosting'] = GradientBoostingClassifier(n_estimators=333, learning_rate=0.8, max_depth=5, random_state=0)
models['Decision Trees'] = DecisionTreeClassifier()
models['Random Forest'] = RandomForestClassifier()
models['XGB Classifier'] = XGBClassifier(objective='binary:logistic', eta=0.3, max_depth= 5, eval_metric = 'aucpr')
models['Naive Bayes'] = GaussianNB()
models['K-Nearest Neighbor'] = KNeighborsClassifier()

In [12]:
# Train the model
trained_models = train_model(X_train, y_train, models)



In [13]:
# Get the result metrics into a dataframe
def df_model_results(trained_models, X_data, y_data):
    """
    Get model result into a dataframe
    
    Inputs:
    -------
    trained_models : dict
                A dictionary of the trained models
    X_data : numpy.ndarray
        The data we want to evaluate the model on
    y_data : numpy.ndarray
        The actual label of the data
    Returns:
    --------
    df_model : pandas.core.frame.DataFrame
            The evalution results
    """
    fbeta, precision, recall = {}, {}, {}
    for key in trained_models.keys():

        predictions = trained_models[key].predict(X_data)

        fbeta[key] = fbeta_score(y_data, predictions, beta=1, zero_division=1)
        precision[key] = precision_score(predictions, y_data)
        recall[key] = recall_score(predictions, y_data)

    df_model = pd.DataFrame(index=models.keys(), columns=['fbeta', 'precision', 'recall'])
    df_model['fbeta'] = fbeta.values()
    df_model['precision'] = precision.values()
    df_model['recall'] = recall.values()

    return df_model

In [14]:
# Display the result of the model for the train data into a dataframe
df_train_results = df_model_results(trained_models, X_train, y_train)
df_train_results

Unnamed: 0,fbeta,precision,recall
Logistic Regression,0.384543,0.262508,0.718613
Extrat Classfier,0.99992,0.99984,1.0
Support Vector Machines,0.214951,0.122194,0.892272
Gradient Boosting,0.98025,0.974984,0.985573
Decision Trees,0.99992,0.99984,1.0
Random Forest,0.99984,0.99984,0.99984
XGB Classifier,0.757015,0.698685,0.825972
Naive Bayes,0.41861,0.306607,0.659538
K-Nearest Neighbor,0.564904,0.452213,0.752401


Extrat Classfier, Decision Trees and Random Forest have the best result on the training data

In [15]:
# Get the test processed data
X_test, y_test, encoder, lb = process_data(
                        test, categorical_features=categorical_features, 
                        label="salary", training=False, encoder=encoder, lb=binarizer
                    )

In [16]:
# Get the result metrics for the test data into a dataframe
df_test_results = df_model_results(trained_models, X_test, y_test)
df_test_results

Unnamed: 0,fbeta,precision,recall
Logistic Regression,0.404826,0.282595,0.713386
Extrat Classfier,0.649077,0.613849,0.688593
Support Vector Machines,0.253369,0.1466,0.93254
Gradient Boosting,0.682452,0.656269,0.710811
Decision Trees,0.63086,0.633812,0.627936
Random Forest,0.684318,0.628821,0.750558
XGB Classifier,0.720922,0.663132,0.789747
Naive Bayes,0.436333,0.328135,0.65099
K-Nearest Neighbor,0.418696,0.332502,0.565217


On the test set the XGB classifier has the best result. We will use Baysian Optimisation to get the best hyperparameter to improve the result for XGB Classifier

### Hyperparameter tuning for XGB Classifier

In [17]:
# Import the necessary libraries
from bayes_opt import BayesianOptimization
import xgboost as xgb

In [18]:
def bo_tune_xgb(max_depth, gamma, eta):
    """
    Function with the internals we wish to maximize
    
    Inputs
    ------
    max_depth : tuple
            Range of maximum depth of a tree.
    gamma : tuple
        Range of minimum loss reduction required to make a further 
        partition on a leaf node of the tree.
    eta : tuple
        Range of step size shrinkage used in update to prevents overfitting. 
                
    """
    # Define the value range for the parameters
    params = {
        'objective': 'binary:logistic',
        'max_depth': int(max_depth),
        'eta': eta,
        'eval_metric': 'aucpr'
    }
    
    #Cross validating with the specified parameters in 5 folds and 70 iterations
    cv_result = xgb.cv(params, training_xgb_matrix, num_boost_round=70, nfold=5)
    #Return the resul
    cv_result = cv_result['train-aucpr-mean'].iloc[-1]
    return 1.0 * cv_result

In [19]:
# Instantiate a BayesianOptimization
xgb_bo = BayesianOptimization(
    bo_tune_xgb, {
        'max_depth': (3, 7),
        'gamma': (0, 1),
        'eta': (0.01, 0.4),
    }
)

In [20]:
# Group the train data into a xgb.DMatrix
training_xgb_matrix = xgb.DMatrix(X_train, label=y_train)
test_xgb_matrix = xgb.DMatrix(X_test, label=y_test)

In [21]:
# Run the the optimization
xgb_bo.maximize(n_iter=6, init_points=8)

|   iter    |  target   |    eta    |   gamma   | max_depth |
-------------------------------------------------------------
| [0m1        [0m | [0m0.8942   [0m | [0m0.3361   [0m | [0m0.8429   [0m | [0m6.482    [0m |
| [0m2        [0m | [0m0.8276   [0m | [0m0.2018   [0m | [0m0.7963   [0m | [0m3.78     [0m |
| [0m3        [0m | [0m0.7714   [0m | [0m0.03079  [0m | [0m0.4343   [0m | [0m3.8      [0m |
| [0m4        [0m | [0m0.8411   [0m | [0m0.3269   [0m | [0m0.986    [0m | [0m3.322    [0m |
| [0m5        [0m | [0m0.7874   [0m | [0m0.05831  [0m | [0m0.5886   [0m | [0m3.396    [0m |
| [0m6        [0m | [0m0.863    [0m | [0m0.3736   [0m | [0m0.7115   [0m | [0m4.303    [0m |
| [0m7        [0m | [0m0.8575   [0m | [0m0.1159   [0m | [0m0.5573   [0m | [0m6.26     [0m |
| [0m8        [0m | [0m0.8031   [0m | [0m0.05215  [0m | [0m0.9851   [0m | [0m4.228    [0m |
| [95m9        [0m | [95m0.9012   [0m | [95m0.3885   [

In [22]:
# Show the best hypermarameters
params = xgb_bo.max['params']
print(params)

{'eta': 0.4, 'gamma': 0.0, 'max_depth': 7.0}


In [23]:
# Retrain the model with the best hyperparameter
params = {
    'objective': 'binary:logistic',
    'eta': round(params['eta'], 1),
    'max_depth': round(params['max_depth']),
    'gamma': round(params['gamma']),
    'eval_metric': 'aucpr',
}

# Create a list of xgb.DMatrix
watch_list = [
                (test_xgb_matrix, 'eval'),
                (training_xgb_matrix, 'train')
            ]

# Train the model with the selected hyperparameters
xgb_model = xgb.train(params,
                          training_xgb_matrix,
                          num_boost_round=999,
                          evals=watch_list,
                          early_stopping_rounds=20)

[0]	eval-aucpr:0.77350	train-aucpr:0.77043
[1]	eval-aucpr:0.79435	train-aucpr:0.79564
[2]	eval-aucpr:0.80644	train-aucpr:0.81153
[3]	eval-aucpr:0.80829	train-aucpr:0.81502
[4]	eval-aucpr:0.81316	train-aucpr:0.82061
[5]	eval-aucpr:0.81560	train-aucpr:0.82458
[6]	eval-aucpr:0.81805	train-aucpr:0.82832
[7]	eval-aucpr:0.81865	train-aucpr:0.83145
[8]	eval-aucpr:0.81965	train-aucpr:0.83581
[9]	eval-aucpr:0.81998	train-aucpr:0.83716
[10]	eval-aucpr:0.82251	train-aucpr:0.83926
[11]	eval-aucpr:0.82356	train-aucpr:0.84159
[12]	eval-aucpr:0.82467	train-aucpr:0.84316
[13]	eval-aucpr:0.82968	train-aucpr:0.85013
[14]	eval-aucpr:0.83047	train-aucpr:0.85362
[15]	eval-aucpr:0.83015	train-aucpr:0.85416
[16]	eval-aucpr:0.83040	train-aucpr:0.85695
[17]	eval-aucpr:0.83073	train-aucpr:0.86047
[18]	eval-aucpr:0.83360	train-aucpr:0.86259
[19]	eval-aucpr:0.83381	train-aucpr:0.86356
[20]	eval-aucpr:0.83483	train-aucpr:0.86505
[21]	eval-aucpr:0.83494	train-aucpr:0.86594
[22]	eval-aucpr:0.83497	train-aucpr:0.8665

In [24]:
def classify_type(y_pred, y_label):
    """
    Get classification type
    
    Inputs:
    -------
    y_pred : numpy.ndarray
        The predictions result
    y_label : numpy.ndarray
        The actual label
    Returns:
    --------
    rs : str
        The clasification type
    """
    
    rs = 'TP' if y_pred == 1 and y_label == 1 else 'FP' if y_pred == 1 and y_label == 0 else 'TN' if y_pred == 0 and y_label == 0 else 'FN'
    return rs

def evaluation(threshold):
    """
    Get the evaluation result
    Inputs:
    -------
    threshold: float
            The value that we need to compare the prediction result with
    Returns:
    --------
    precision : float
            The model precision
    recall : float
        The recall ratio
    fscore : float
        The F1 score
    accuracy : float
        The accuracy of the model
    y_test: numpy.ndarray
        The test label
    y_predict : numpy.ndarray
            The predicted label
    """
    test_evaluation = test.copy()
    predictions = xgb_model.predict(test_xgb_matrix)
    test_evaluation['label'] = test_evaluation.apply(lambda x: 0 if x['salary'] == "<=50k" else 1, axis=1)
    test_evaluation['predicted_score'] = predictions
    test_evaluation['predicted_label'] = test_evaluation.apply(lambda x: 1 if x['predicted_score'] >= threshold else 0, axis = 1)
    test_evaluation['type'] = test_evaluation.apply(lambda x: classify_type(x['predicted_label'], x['label']), axis = 1)
    y_predict = test_evaluation['predicted_label'].tolist()
    precision, recall, fscore, support = score(y_test, y_predict)
    accuracy = accuracy_score(y_test, y_predict)

    return round(recall[1], 2), round(precision[1], 2), round(fscore[1], 2), round(accuracy, 2), y_test, y_predict, support

In [25]:
# Get the evaluation result
threshold = 0.5
recall, precision, fscore, accuracy, y_test, y_predict, support = evaluation(threshold)

In [26]:
# Print the evaluation result
print('recall: {}'.format(recall))
print('precision: {}'.format(precision))
print('fscore: {}'.format(fscore))
print('support: {}'.format(support))
print('accuracy: {}'.format(accuracy))

recall: 0.64
precision: 0.73
fscore: 0.68
support: [4905 1603]
accuracy: 0.85


In [28]:
# Print the classification report result
print(classification_report(y_test, y_predict, target_names=['0', '1']))

              precision    recall  f1-score   support

           0       0.89      0.92      0.90      4905
           1       0.73      0.64      0.68      1603

    accuracy                           0.85      6508
   macro avg       0.81      0.78      0.79      6508
weighted avg       0.85      0.85      0.85      6508



We noticed that the model give better result for the label 0 than the label 1

In [29]:
# Show the distribution of the salary
test.groupby(['salary']).agg(person_count=("salary", "count")).reset_index()

Unnamed: 0,salary,person_count
0,<=50K,4905
1,>50K,1603


This model can be improve if we do explainable AI to find what is wrong with the data, maybe find the bias and analyse the False Positive and the False negative to know why we get thos results.